# EVOLVE-BLOCK-START
"""
Stable log-parameterization additive power-law scaling for LLM loss under
unique_tokens (U), parameters (P), and tokens (T) constraints:
L(U,P,T) = c0
+ exp(lk1 - a1·ln U)
+ exp(lk2 - a2·ln P)
+ exp(lk3 - a3·ln T)
7 parameters:
c0, lk1, a1, lk2, a2, lk3, a3
Positivity of k-terms is enforced via exp(log-k). Exponents a_i ∈ [0,5].
Fitted via L-BFGS-B with bounds for numerical stability.
"""

import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
"""
Predict cross-entropy loss from (U, P, T) data.
Args:
data_points: array-like of shape (N,3): [unique_tokens, params, tokens]
params: array of 7 floats: [c0, lk1, a1, lk2, a2, lk3, a3]
Returns:
preds: ndarray of shape (N,) of predicted losses.
"""
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X[None, :]
    # Clip inputs away from zero to avoid log(0).
    U = np.clip(X[:, 0], 1e-8, None)
    P = np.clip(X[:, 1], 1e-8, None)
    T = np.clip(X[:, 2], 1e-8, None)
    c0, lk1, a1, lk2, a2, lk3, a3 = params
    lnU, lnP, lnT = np.log(U), np.log(P), np.log(T)
    # Additive sum of three positive power-law terms, evaluated in log-space:
    # exp(lk_i - a_i·ln X) == k_i / X**a_i with k_i = exp(lk_i).
    termU = np.exp(lk1 - a1 * lnU)
    termP = np.exp(lk2 - a2 * lnP)
    termT = np.exp(lk3 - a3 * lnT)
    return c0 + termU + termP + termT
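

# A minimal usage sketch (the parameter values below are illustrative
# assumptions, not fitted results):
#
#     p = np.array([1.8, 5.5, 0.30, 5.8, 0.35, 5.8, 0.28])
#     scaling_law_func([[1e9, 1e8, 1e10]], p)   # -> array of shape (1,)
#
# Each term shrinks as its budget (U, P, or T) grows, so predictions
# decay toward the irreducible floor c0.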


def fit_scaling_law(data_points, loss_values):
    """
    Fit the 7-parameter scaling law to (U, P, T) → loss data.

    Args:
        data_points: ndarray of shape (N, 3) with [unique_tokens, params, tokens]
        loss_values: ndarray of shape (N,) of observed losses

    Returns:
        params_opt: ndarray of fitted parameters [c0, lk1, a1, lk2, a2, lk3, a3]
    """
    X = np.asarray(data_points, dtype=float)
    y = np.asarray(loss_values, dtype=float).ravel()
    # 1) Initialize c0 just below the lower envelope of y, capped at y_min
    #    so the starting point respects the c0 bound defined below.
    y_min = np.min(y)
    c0_init = min(max(0.0, np.percentile(y, 5) * 0.9), y_min)
    # 2) Shifted target for the k-terms.
    y_shift = np.clip(y - c0_init, 1e-12, None)
    # 3) Inverse log-spread weights for U, P, T: inputs whose logs vary
    #    less in the data receive a larger share of the initial loss.
    log_feats = np.vstack([
        np.log(np.clip(X[:, 0], 1e-12, None)),
        np.log(np.clip(X[:, 1], 1e-12, None)),
        np.log(np.clip(X[:, 2], 1e-12, None)),
    ])
    inv_spread = 1.0 / (np.std(log_feats, axis=1) + 1e-8)
    w = inv_spread / np.sum(inv_spread)
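    # For example (illustrative numbers only): if std(ln U) = 2 and
    # std(ln P) = std(ln T) = 4, then inv_spread ≈ [0.50, 0.25, 0.25],
    # which already sums to 1, so w ≈ [0.50, 0.25, 0.25] and the U-term
    # starts with twice the coefficient of the other two.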
    # 4) Allocate the mean shifted loss across the three terms.
    base = np.mean(y_shift)
    k_inits = base * w  # positive initial magnitudes
    # 5) Parameterize each k_i via log(k_i) for stability.
    lk1_init, lk2_init, lk3_init = np.log(np.clip(k_inits, 1e-12, None))
    # 6) Initial guesses for the exponents.
    a1_init = a2_init = a3_init = 0.5
    init = np.array([
        c0_init,
        lk1_init, a1_init,
        lk2_init, a2_init,
        lk3_init, a3_init,
    ], dtype=float)
    # 7) Bounds: c0 ∈ [0, y_min] (the three terms are nonnegative, so a
    #    floor above the smallest observed loss could never fit that point),
    #    exponents ∈ [0, 5], log-coefficients unbounded.
    bounds = [
        (0.0, y_min),   # c0
        (None, None),   # lk1
        (0.0, 5.0),     # a1
        (None, None),   # lk2
        (0.0, 5.0),     # a2
        (None, None),   # lk3
        (0.0, 5.0),     # a3
    ]
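    # Note: a_i = 0 is allowed, which turns term i into the constant
    # exp(lk_i); the optimizer can thereby effectively drop a variable
    # that carries no signal, folding its term into the overall floor.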
    # 8) Objective: mean squared error between predictions and targets.
    def objective(p):
        pred = scaling_law_func(X, p)
        return np.mean((pred - y) ** 2)
    # 9) Optimize with bounded L-BFGS-B.
    result = minimize(
        objective,
        init,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-12, 'gtol': 1e-8, 'maxiter': 5000},
    )
    if result.success:
        return result.x
    # Fall back to the initialization if the optimizer fails to converge.
    return init
# EVOLVE-BLOCK-END
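

if __name__ == "__main__":
    # Self-check sketch on synthetic data. The ground-truth parameters and
    # budget ranges below are assumptions chosen for illustration, not
    # values from any real LLM training runs.
    rng = np.random.default_rng(0)
    true_params = np.array([1.8, 5.5, 0.30, 5.8, 0.35, 5.8, 0.28])
    U = 10 ** rng.uniform(8, 10, size=64)    # unique tokens
    P = 10 ** rng.uniform(7, 9, size=64)     # model parameters
    T = 10 ** rng.uniform(9, 11, size=64)    # training tokens
    X = np.column_stack([U, P, T])
    y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=64)
    fitted = fit_scaling_law(X, y)
    rmse = np.sqrt(np.mean((scaling_law_func(X, fitted) - y) ** 2))
    print("fitted params:", np.round(fitted, 4))
    print("train RMSE:", float(rmse))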