# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize
# normalization constants: reference learning rate, batch size, data size (D), and model size (N)
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8

def scaling_law_func(data_points, params):
    """
    Enhanced scaling law:

        L = Linf
            + A * (N/N0)^(-alpha) * (D/D0)^(-beta)
              * [1 + B * (lr/lr0)^(gamma) * (bsz/bsz0)^(-delta)]^rho

    data_points columns, in order: learning rate, batch size, data size D, model size N.

    params (length 8):
        0: Linf       – asymptotic loss floor
        1: log(A)     – size-term prefactor
        2: log(alpha) – exponent on model size
        3: log(beta)  – exponent on data size
        4: log(B)     – hyperparameter coupling prefactor
        5: gamma      – learning-rate exponent
        6: delta      – batch-size exponent
        7: log(rho)   – saturation exponent on the hyperparameter term
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    # unpack and normalize features
    lr_rat = X[:, 0] / _lr0
    bsz_rat = X[:, 1] / _bsz0
    D_rat = X[:, 2] / _D0
    N_rat = X[:, 3] / _N0

    p = np.asarray(params, dtype=float)
    if p.ndim == 1:
        p = p[None, :]
    assert p.shape[1] == 8, f"Expected 8 params, got {p.shape[1]}"
    T = p.shape[0]

    # parameter transforms (log-parameterized quantities are exponentiated)
    Linf = p[:, 0]
    A = np.exp(p[:, 1])
    alpha = np.exp(p[:, 2])
    beta = np.exp(p[:, 3])
    B = np.exp(p[:, 4])
    gamma = p[:, 5]
    delta = p[:, 6]
    rho = np.exp(p[:, 7])

    # broadcast features to shape (n_points, T)
    lr_m = lr_rat[:, None]
    bsz_m = bsz_rat[:, None]
    D_m = D_rat[:, None]
    N_m = N_rat[:, None]

    size_term = A[None, :] * (N_m ** (-alpha[None, :])) * (D_m ** (-beta[None, :]))
    hyper = 1.0 + B[None, :] * (lr_m ** (gamma[None, :])) * (bsz_m ** (-delta[None, :]))
    Y = Linf[None, :] + size_term * (hyper ** rho[None, :])

    # return shape (n_points,) for a single parameter set, else (n_points, T)
    return Y[:, 0] if T == 1 else Y
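
# Illustrative usage (comments only; a minimal sketch with made-up values, assuming
# data_points columns are ordered [learning rate, batch size, D, N] as documented above):
#
#   X_demo = np.array([[1e-3, 256.0, 1e10, 1e8],
#                      [3e-4, 512.0, 2e10, 4e8]])
#   p_demo = np.array([1.7, 0.0, np.log(0.5), np.log(0.5),
#                      np.log(0.1), 0.0, 0.0, 0.0])
#   scaling_law_func(X_demo, p_demo)                       # shape (2,)
#   scaling_law_func(X_demo, np.stack([p_demo, p_demo]))   # shape (2, 2)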

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 8-parameter enhanced scaling law by minimizing
    MSE in log-loss space, with positivity and stability bounds.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    # initial guesses
    linf0 = 0.9 * np.min(y)
    A0 = max(1e-6, np.median(y) - linf0)
    alpha0 = 0.5
    beta0 = 0.5
    B0 = 0.1
    gamma0 = 0.0
    delta0 = 0.0
    rho0 = 1.0
    p0 = np.array([
        linf0,
        np.log(A0),
        np.log(alpha0),
        np.log(beta0),
        np.log(B0),
        gamma0,
        delta0,
        np.log(rho0),
    ], dtype=float)
    # bounds keep parameters positive where required and numerically stable
    bnds = [
        (0.0, np.min(y)),   # Linf
        (-20.0, 20.0),      # log(A)
        (-5.0, 5.0),        # log(alpha)
        (-5.0, 5.0),        # log(beta)
        (-20.0, 20.0),      # log(B)
        (-10.0, 10.0),      # gamma
        (-10.0, 10.0),      # delta
        (-5.0, 5.0),        # log(rho)
    ]
    def objective(p):
        y_pred = scaling_law_func(X, p)
        # heavy penalty on non-positive predictions, where the log-loss below is undefined
        if np.any(y_pred <= 0.0):
            return 1e6 + np.sum((np.minimum(y_pred, 1e-6)) ** 2)
        # log-space MSE
        d = np.log(y_pred) - np.log(y)
        return np.mean(d * d)

    res = minimize(objective, p0, method='L-BFGS-B', bounds=bnds)
    # fall back to the initial guess if the optimizer fails
    return res.x if (res.success and res.x.shape == p0.shape) else p0
# EVOLVE-BLOCK-END
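

if __name__ == "__main__":
    # Minimal self-check (a sketch outside the evolve block, not part of the evolved code):
    # synthetic runs are drawn from the functional form itself with made-up "true"
    # parameters, so this only exercises input shapes and the fitting path.
    rng = np.random.default_rng(0)
    n_runs = 64
    X_demo = np.column_stack([
        10.0 ** rng.uniform(-4.0, -2.5, n_runs),   # learning rate
        2.0 ** rng.integers(6, 11, n_runs),        # batch size
        10.0 ** rng.uniform(9.0, 11.0, n_runs),    # data size D
        10.0 ** rng.uniform(7.0, 9.0, n_runs),     # model size N
    ])
    true_params = np.array([1.8, np.log(2.0), np.log(0.3), np.log(0.3),
                            np.log(0.1), 0.2, 0.1, np.log(1.0)])
    y_demo = scaling_law_func(X_demo, true_params) * np.exp(0.01 * rng.standard_normal(n_runs))
    fitted = fit_scaling_law(X_demo, y_demo)
    preds = scaling_law_func(X_demo, fitted)
    print("fitted params:", np.round(fitted, 3))
    print("RMSE:", float(np.sqrt(np.mean((preds - y_demo) ** 2))))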