# EVOLVE-BLOCK-START
"""
Lossu scaling with parameter–data synergy and U-shaped vocabulary modulation.
Model (7 params):
L = L_inf + A / ( (P/P_REF)^alpha + g * (D_eff/C_REF)^beta )
D_eff = C / (1 + k * (log(V+1) - m)^2)
Parameterization (stability/positivity):
[0] L_inf, [1] log_A, [2] s_alpha, [3] s_beta, [4] log_g, [5] m, [6] log_k
alpha = softplus(s_alpha), beta = softplus(s_beta), g = exp(log_g), k = exp(log_k), A = exp(log_A)
"""
import numpy as np
from scipy.optimize import minimize
_P_REF = 1e8
_C_REF = 1e10
_EPS = 1e-12
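# Reference scales keep the power-law bases near O(1) for typical parameter counts (~1e8) and data budgets (~1e10).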
def _softplus(x):
    # Numerically stable softplus: return x directly for large inputs and clamp the
    # argument of np.exp so it cannot overflow (avoids spurious RuntimeWarnings).
    x = np.asarray(x, dtype=float)
    return np.where(x > 20.0, x, np.log1p(np.exp(np.minimum(x, 20.0))))
def _inv_softplus(y):
    # Inverse of softplus: log(exp(y) - 1); expm1 keeps precision for small y.
    y = float(max(1e-12, y))
    return float(np.log(np.expm1(y)))
def _decode_1d(p):
p = np.asarray(p).ravel()
if p.size != 7: raise ValueError("params must have length 7")
return (p[0], np.exp(p[1]), _softplus(p[2]), _softplus(p[3]), np.exp(p[4]), p[5], np.exp(p[6]))
def _decode_2d(P):
if P.shape[1] != 7: raise ValueError("params rows must have length 7")
return (P[:,0], np.exp(P[:,1]), _softplus(P[:,2]), _softplus(P[:,3]), np.exp(P[:,4]), P[:,5], np.exp(P[:,6]))
def scaling_law_func(data_points, params):
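    # Accepts a single 7-parameter vector or a (K, 7) matrix of parameter rows; with a matrix,
    # every data point is evaluated against every row and an (N, K) prediction array is returned
    # (squeezed to (N,) when K == 1).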
X = np.atleast_2d(np.asarray(data_points))
if X.shape[1] != 3: raise ValueError("data_points must have shape (N,3)")
P, V, C = X[:,0].astype(float), X[:,1].astype(float), X[:,2].astype(float)
logV = np.log(V + 1.0)
par = np.asarray(params)
if par.ndim == 1:
L0, A, a, b, g, m, k = _decode_1d(par)
denomV = 1.0 + k * (logV - m)**2
Deff = C / (denomV + _EPS)
Ps = (P / _P_REF) + _EPS
Ds = (Deff / _C_REF) + _EPS
return L0 + A / (np.power(Ps, a) + g * np.power(Ds, b) + _EPS)
elif par.ndim == 2:
L0, A, a, b, g, m, k = _decode_2d(par)
Pc, Cc, logVc = P[:,None], C[:,None], logV[:,None]
denomV = 1.0 + k[None,:] * (logVc - m[None,:])**2
Deff = Cc / (denomV + _EPS)
Ps = (Pc / _P_REF) + _EPS
Ds = (Deff / _C_REF) + _EPS
pred = L0[None,:] + A[None,:] / (np.power(Ps, a[None,:]) + g[None,:] * np.power(Ds, b[None,:]) + _EPS)
return pred if pred.shape[1] > 1 else pred[:,0]
else:
raise ValueError("params must be 1D or 2D array")
def fit_scaling_law(data_points, loss_values):
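    # data_points: (N, 3) array of (P, V, C); loss_values: (N,) or (N, T). Returns a single
    # 7-parameter vector when one loss column is given, otherwise a (T, 7) array, one row per column.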
X = np.atleast_2d(np.asarray(data_points))
y = np.asarray(loss_values)
if X.shape[1] != 3: raise ValueError("data_points must have shape (N,3)")
Y = y[:,None] if y.ndim == 1 else y
if Y.shape[0] != X.shape[0]: raise ValueError("loss_values must align with data_points rows")
N = X.shape[0]
P, V, C = X[:,0].astype(float), X[:,1].astype(float), X[:,2].astype(float)
logV = np.log(V + 1.0)
lv_min, lv_max = float(np.min(logV)), float(np.max(logV))
lv_span = max(1e-6, lv_max - lv_min)
# Balance vocab groups to prevent dominance by frequent vocab sizes
uv, cnt = np.unique(V, return_counts=True)
w_map = dict(zip(uv.tolist(), cnt.tolist()))
w = np.array([1.0 / w_map[v] for v in V], float)
w *= N / np.sum(w)
def huber(r, d):
a = np.abs(r)
return np.where(a <= d, 0.5*r*r, d*(a - 0.5*d))
def phi_from_params(sa, sb, lg, m, lk):
a = _softplus(sa); b = _softplus(sb)
g = np.exp(lg); k = np.exp(lk)
denomV = 1.0 + k * (logV - m)**2
Deff = C / (denomV + _EPS)
Ps = (P / _P_REF) + _EPS
Ds = (Deff / _C_REF) + _EPS
den = np.power(Ps, a) + g * np.power(Ds, b) + _EPS
return 1.0 / den
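    # Given phi_i = 1 / ((P_i/P_REF)^alpha + g * (D_eff_i/C_REF)^beta), predictions are linear in
    # (L_inf, A): pred_i = L_inf + A * phi_i. solve_LA profiles these two out with a weighted
    # least-squares solve and clips A to stay positive.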
def solve_LA(phi, target):
sw = np.sqrt(w + _EPS)
M = np.stack([np.ones_like(phi), phi], 1)
Mw = sw[:,None] * M
yw = sw * target
try:
theta, _, _, _ = np.linalg.lstsq(Mw, yw, rcond=None)
except Exception:
theta = np.array([np.average(target, weights=w), 1e-3], float)
L, A = float(theta[0]), float(theta[1])
if A <= 1e-12:
A = 1e-12
L = float((np.sum(w*(target - A*phi)) / (np.sum(w) + _EPS)))
return L, A
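    # Per-column fit: grid-seed the nonlinear shape parameters (s_alpha, s_beta, log_g, m, log_k),
    # score each seed with a weighted Huber loss after profiling out (L_inf, A), then polish the
    # best seeds with perturbed L-BFGS-B restarts.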
def optimize_column(target):
ymin, ymax = float(np.min(target)), float(np.max(target))
med = float(np.median(target))
mad = float(np.median(np.abs(target - med)) + 1e-8)
delta = max(0.05, 1.4826 * mad)
# Seed grid for (m, k) and data/param balance g
m_grid = np.linspace(lv_min, lv_max, 5)
k_grid = np.array([0.05, 0.2, 1.0, 5.0, 20.0]) / (lv_span**2 + 1e-12)
a0, b0 = 0.35, 0.30
sa0, sb0 = _inv_softplus(a0), _inv_softplus(b0)
seeds = []
for m0 in m_grid:
for lk0 in np.log(k_grid + _EPS):
# balance g via median branch magnitudes
denomV = 1.0 + np.exp(lk0) * (logV - m0)**2
Deff = C / (denomV + _EPS)
Ps = (P / _P_REF) + _EPS
Ds = (Deff / _C_REF) + _EPS
pphi = np.power(Ps, a0); dphi = np.power(Ds, b0)
g0 = float(np.median(pphi) / (np.median(dphi) + _EPS))
seeds.append(np.array([sa0, sb0, np.log(max(g0,1e-12)), m0, lk0], float))
# Score seeds via robust error with closed-form (L_inf, A)
def score(par5):
phi = phi_from_params(*par5)
L, A = solve_LA(phi, target)
pred = L + A*phi
return float(np.sum(w * huber(pred - target, delta)) / N)
seeds.sort(key=score)
topK = seeds[:6]
# Local objective only over (s_alpha, s_beta, log_g, m, log_k)
def obj(par5):
sa, sb, lg, m, lk = par5
# mild priors to stabilize exponents, vocab center and width
a = _softplus(sa); b = _softplus(sb); k = np.exp(lk)
reg = (1e-4 * ((a - 0.35)**2 + (b - 0.30)**2)
+ 1e-6 * (np.log(k + 1e-12)**2)
+ 1e-5 * (max(0.0, m - (lv_max + 0.5))**2 + max(0.0, (lv_min - 0.5) - m)**2))
phi = phi_from_params(sa, sb, lg, m, lk)
L, A = solve_LA(phi, target)
pred = L + A*phi
loss = np.sum(w * huber(pred - target, delta)) / N
return loss + reg
bounds = [
(None, None), # s_alpha
(None, None), # s_beta
(-20.0, 20.0), # log_g
(lv_min - 1.0, lv_max + 1.0), # m
(-20.0, 10.0), # log_k
]
rng = np.random.default_rng(2025)
best_x, best_val = topK[0], obj(topK[0])
for base in topK:
for _ in range(3):
start = base + rng.normal(0, [0.4,0.4,0.4,0.15,0.3])
res = minimize(obj, start, method='L-BFGS-B', bounds=bounds, options={'maxiter':700})
                # Re-evaluate the objective at the returned point so the stored value and the
                # stored parameters refer to the same iterate, even if L-BFGS-B reports failure.
                val = float(obj(res.x))
                if val < best_val: best_val, best_x = val, res.x
# Recover (L_inf, log_A) at optimum via weighted LS
phi = phi_from_params(*best_x)
L_opt, A_opt = solve_LA(phi, target)
return np.array([L_opt, np.log(max(A_opt, 1e-12)), best_x[0], best_x[1], best_x[2], best_x[3], best_x[4]], float)
T = Y.shape[1]
pars = np.stack([ optimize_column(Y[:,t]) for t in range(T) ], 0)
return pars[0] if T == 1 else pars
# EVOLVE-BLOCK-END
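
if __name__ == "__main__":
    # Minimal self-check sketch (illustrative only, outside the evolved block): synthetic (P, V, C)
    # points and an assumed ground-truth parameter vector exercise the round trip
    # scaling_law_func -> fit_scaling_law -> scaling_law_func. All constants below are made up.
    _rng = np.random.default_rng(0)
    _n = 200
    _P = _rng.uniform(2e7, 2e9, size=_n)                           # parameter counts
    _V = _rng.choice(np.array([8e3, 32e3, 64e3, 128e3]), size=_n)  # vocabulary sizes
    _C = _rng.uniform(1e9, 5e11, size=_n)                          # data budgets (tokens)
    _X = np.stack([_P, _V, _C], axis=1)
    _true = np.array([1.8, 1.2, _inv_softplus(0.35), _inv_softplus(0.30),
                      0.0, np.log(32e3 + 1.0), np.log(0.1)])
    _y = scaling_law_func(_X, _true) + _rng.normal(0.0, 0.01, size=_n)
    _est = fit_scaling_law(_X, _y)
    _pred = scaling_law_func(_X, _est)
    print("fitted params:", np.round(_est, 4))
    print("RMSE on noisy synthetic losses:", float(np.sqrt(np.mean((_pred - _y) ** 2))))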