# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize, nnls
def scaling_law_func(data_points, params):
"""
Computes loss L = E + A * (N/1e9)^-alpha * K^-beta
    Args:
        data_points: (S, 2) array; columns are [num_params N, parallel_size K]
        params: (4,) or (M, 4) array of [E, A, alpha, beta]
    Returns:
        Predicted loss, shape (S,) for a single parameter set, else (S, M)
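    Example (illustrative values, not taken from any real experiment):
        >>> pts = np.array([[1.0e9, 8.0], [7.0e9, 64.0]])
        >>> scaling_law_func(pts, np.array([1.7, 2.0, 0.3, 0.1])).shape
        (2,)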
"""
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X[None, :]
    P = np.asarray(params, dtype=float)
    if P.ndim == 1:
        P = P[None, :]
# Inputs: N in billions, K is raw count
# Pre-scaling N helps numerical stability of the power law base
N_scaled = X[:, 0] / 1.0e9
K = X[:, 1]
# Extract parameters
E = P[:, 0]
A = P[:, 1]
alpha = P[:, 2]
beta = P[:, 3]
# Broadcasting: (Samples, 1) vs (1, ParamSets)
# Term 1: Model size scaling
# N_scaled (N,) -> (N, 1), alpha (M,) -> (1, M)
term_N = N_scaled[:, None] ** -alpha[None, :]
# Term 2: Parallel size scaling
# K (N,) -> (N, 1), beta (M,) -> (1, M)
term_K = K[:, None] ** -beta[None, :]
# Combined: E + A * term_N * term_K
pred = E[None, :] + A[None, :] * term_N * term_K
# Flatten if single parameter set
if pred.shape[1] == 1:
return pred[:, 0]
return pred
def fit_scaling_law(data_points, loss_values):
"""
Fits parameters [E, A, alpha, beta] using Variable Projection (VarPro).
The problem is separable:
L(E, A, alpha, beta) = sum( (y - (E + A * N^-alpha * K^-beta))^2 )
For fixed (alpha, beta), the optimal (E, A) can be found via
Non-Negative Least Squares (NNLS) on the features [1, N^-alpha * K^-beta].
We optimize the outer variables (alpha, beta) using L-BFGS-B, evaluating
the inner variables (E, A) on the fly. This projects the difficult 4D
optimization surface onto a smoother 2D surface.
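    Example (synthetic round trip; the "true" parameters here are made up):
        >>> rng = np.random.default_rng(0)
        >>> X = np.column_stack([10 ** rng.uniform(8, 11, 64),
        ...                      2.0 ** rng.integers(1, 8, 64)])
        >>> y = scaling_law_func(X, np.array([1.7, 2.0, 0.3, 0.1]))
        >>> E, A, alpha, beta = fit_scaling_law(X, y)  # should roughly recover them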
"""
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    # Coerce targets to a (num_samples, num_targets) column layout; calling
    # .ndim on the raw argument would fail for plain Python lists.
    Y = np.asarray(loss_values, dtype=float)
    if Y.ndim == 1:
        Y = Y[:, None]
N_scaled = X[:, 0] / 1.0e9
K = X[:, 1]
fitted_params = []
# Design matrix container for the inner linear problem
# Column 0 is always 1s (intercept E)
# Column 1 will be updated with the power law term (slope A)
M = np.ones((X.shape[0], 2))
for i in range(Y.shape[1]):
y = Y[:, i]
# Define the profile loss function
def objective(exponents):
alpha, beta = exponents
            # Compute the power-law feature z = N^-alpha * K^-beta.
            # Inputs are strictly positive, so no epsilon guard is needed here.
            z = (N_scaled ** -alpha) * (K ** -beta)
M[:, 1] = z
# Solve non-negative linear least squares: min ||M*w - y||^2 s.t. w >= 0
# w = [E, A]
# NNLS is robust and naturally enforces E >= 0, A >= 0
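            # Note: scipy's nnls returns (solution, residual 2-norm ||M w - y||_2)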
w, resid_norm = nnls(M, y)
# Return MSE
return (resid_norm ** 2) / len(y)
        # Strategy: coarse grid search over the exponents, then local
        # refinement with L-BFGS-B; the grid stage reduces the risk of
        # getting trapped in poor local minima of the exponent space.
best_loss = float('inf')
best_x0 = [0.5, 0.5]
# Search grid for exponents
grid_points = [0.05, 0.2, 0.5, 0.8, 1.2, 2.0]
for a0 in grid_points:
for b0 in grid_points:
loss = objective([a0, b0])
if loss < best_loss:
best_loss = loss
best_x0 = [a0, b0]
# Fine-tune alpha, beta using L-BFGS-B
# Bounds ensure exponents stay positive and reasonable
bounds = [(1e-5, 20.0), (1e-5, 20.0)]
        try:
            # Solver-specific tolerances must be passed via `options`;
            # giving ftol/gtol directly to minimize() raises TypeError.
            res = minimize(
                objective,
                best_x0,
                method='L-BFGS-B',
                bounds=bounds,
                options={'ftol': 1e-13, 'gtol': 1e-13},
            )
            alpha_opt, beta_opt = res.x
        except Exception:
            # Fall back to the best grid point if the local refinement fails
            alpha_opt, beta_opt = best_x0
# Reconstruct the linear parameters E, A for the optimal exponents
z = (N_scaled ** -alpha_opt) * (K ** -beta_opt)
M[:, 1] = z
w_opt, _ = nnls(M, y)
E_opt, A_opt = w_opt
fitted_params.append([E_opt, A_opt, alpha_opt, beta_opt])
final_params = np.array(fitted_params)
return final_params[0] if Y.shape[1] == 1 else final_params
# EVOLVE-BLOCK-END
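
# A minimal smoke test, kept outside the evolve block. The "true" parameters
# and data ranges below are arbitrary illustrative choices, not values taken
# from any real experiment.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    true_params = np.array([1.7, 2.0, 0.3, 0.1])  # [E, A, alpha, beta]
    X = np.column_stack([
        10 ** rng.uniform(8, 11, 128),   # model sizes, 1e8 .. 1e11 params
        2.0 ** rng.integers(1, 9, 128),  # parallel sizes, 2 .. 256
    ])
    y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, 128)
    fitted = fit_scaling_law(X, y)
    mse = np.mean((scaling_law_func(X, fitted) - y) ** 2)
    print("fitted [E, A, alpha, beta]:", fitted)
    print("train MSE:", mse)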