# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
Refined scaling law with additive terms for Model, Data, LR penalty, and Batch Size.
Uses geometric mean normalization for stability and physics-informed bounds.
Includes a diverse set of initial guesses including known scaling regimes.
"""
import numpy as np
from scipy.optimize import minimize
def scaling_law_func(data_points, params):
"""
Predicts LM loss based on scaling law parameters.
Model Form:
L = E + A*N^(-alpha) + B*D^(-beta) + C*(log(lr) - log_lr_opt)^2 + F*bsz^G
Where:
log_lr_opt = d0 + d1*log(N) + d2*log(bsz)
    Inputs (data_points columns, in this order) are normalized internally:
        - lr:  Learning Rate / 1e-3
        - bsz: Batch Size / 2048
        - D:   Tokens / 1e10
        - N:   Parameters / 1e9
Parameters (11 total):
0: E (Irreducible loss)
1: A (Model size coeff)
2: alpha (Model size exponent)
3: B (Data size coeff)
4: beta (Data size exponent)
5: C (LR penalty coeff)
6: d0 (Opt LR intercept)
7: d1 (Opt LR slope w.r.t N)
8: d2 (Opt LR slope w.r.t bsz)
9: F (Batch size coeff)
10: G (Batch size exponent)
"""
    # Normalization constants for input columns [lr, bsz, D, N] (Program 1 settings proved effective)
scales = np.array([1e-3, 2048.0, 1e10, 1e9])
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
X_norm = X / scales[None, :]
lr = X_norm[:, 0]
bsz = X_norm[:, 1]
D = X_norm[:, 2]
N_param = X_norm[:, 3]
# Handle params shape
params = np.asarray(params, dtype=np.float64)
original_ndim = params.ndim
if original_ndim == 1:
params = params[None, :]
# Unpack parameters
E = params[:, 0:1]
A = params[:, 1:2]
alpha = params[:, 2:3]
B = params[:, 3:4]
beta = params[:, 4:5]
C = params[:, 5:6]
d0 = params[:, 6:7]
d1 = params[:, 7:8]
d2 = params[:, 8:9]
F = params[:, 9:10]
G = params[:, 10:11]
eps = 1e-9
# Broadcasting preparation
N_p = N_param[None, :]
D_p = D[None, :]
lr_p = lr[None, :]
bsz_p = bsz[None, :]
    # 1. Power Laws for N and D
    # Use abs(alpha) and abs(beta) so both power-law terms always decay with scale
term_N = A * ((N_p + eps) ** (-np.abs(alpha)))
term_D = B * ((D_p + eps) ** (-np.abs(beta)))
# 2. Learning Rate Penalty
# Optimal LR depends on N and bsz
log_N = np.log(N_p + eps)
log_bsz = np.log(bsz_p + eps)
log_lr = np.log(lr_p + eps)
opt_log_lr = d0 + d1 * log_N + d2 * log_bsz
term_LR = C * ((log_lr - opt_log_lr) ** 2)
    # 3. Batch Size Effect
    # Power-law scaling (assuming F > 0):
    #   G < 0: larger batch size -> smaller loss (efficiency / stability gains)
    #   G > 0: larger batch size -> larger loss (diminishing returns from gradient-noise reduction)
term_BSZ = F * ((bsz_p + eps) ** G)
# Total Loss
pred = E + term_N + term_D + term_LR + term_BSZ
# Return shape handling
pred = pred.T
if original_ndim == 1:
return pred[:, 0]
return pred
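# Illustrative helper (a sketch, not part of the fitted/evolved interface): recovers the
# learning rate implied by the quadratic LR penalty, log(lr_opt) = d0 + d1*log(N) + d2*log(bsz),
# in the normalized units used by scaling_law_func. The function name and arguments are
# assumptions for demonstration; it relies on the same normalization constants
# (lr / 1e-3, bsz / 2048, N / 1e9) as scaling_law_func.
def predicted_optimal_lr(params, n_params, batch_size):
    """Return the (un-normalized) learning rate that minimizes the LR penalty term."""
    params = np.asarray(params, dtype=np.float64)
    d0, d1, d2 = params[6], params[7], params[8]
    N = n_params / 1e9            # normalized model size
    bsz = batch_size / 2048.0     # normalized batch size
    log_lr_opt = d0 + d1 * np.log(N + 1e-9) + d2 * np.log(bsz + 1e-9)
    return np.exp(log_lr_opt) * 1e-3  # undo the lr / 1e-3 normalization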
def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law parameters using multi-start L-BFGS-B.
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64).flatten()
min_loss = np.min(y)
def objective(p):
preds = scaling_law_func(X, p)
return np.mean((preds - y)**2)
    # Parameter Bounds
    # E: [1.0, min_loss - 0.01] - Irreducible loss must lie below the best achieved loss
    # A, B: [0, inf]
    # alpha, beta: [0, 3]
    # C: [0, inf]
    # d0, d1, d2: unconstrained
    # F: unconstrained (though usually positive if G is chosen well)
    # G: unconstrained
    # Guard against a degenerate E interval (lower > upper) when min_loss is close to or below 1.0
    e_low = min(1.0, min_loss - 0.02)
    e_high = min_loss - 0.01
    bounds = [
        (e_low, e_high),        # E
(0.0, None), # A
(0.0, 3.0), # alpha
(0.0, None), # B
(0.0, 3.0), # beta
(0.0, None), # C
(None, None), # d0
(None, None), # d1
(None, None), # d2
(None, None), # F
(None, None) # G
]
# Initial Guesses
# p = [E, A, alpha, B, beta, C, d0, d1, d2, F, G]
guesses = [
# 1. Kaplan-like (low exponents)
[1.8, 0.5, 0.07, 0.5, 0.07, 0.2, 0.0, -0.1, 0.1, 0.01, -0.1],
# 2. Chinchilla-like (higher exponents)
[1.6, 1.0, 0.33, 1.0, 0.33, 0.2, 0.0, -0.2, 0.1, 0.01, -0.1],
# 3. High data dependence
[1.5, 0.5, 0.05, 2.0, 0.5, 0.1, 0.5, -0.1, 0.0, 0.05, -0.2],
# 4. Previous Best (Program 1 params approx)
[1.8, 0.5, 0.1, 0.5, 0.1, 0.2, 0.0, -0.1, 0.0, 0.01, -0.01],
# 5. Flat batch size effect
[2.0, 1.0, 0.1, 1.0, 0.1, 0.5, 0.0, -0.2, 0.2, 0.0, 0.0],
# 6. Negative BSZ exponent (efficiency gain)
[1.7, 0.5, 0.1, 0.5, 0.1, 0.2, 0.0, -0.1, 0.1, 0.1, -0.2]
]
best_loss = np.inf
best_params = np.array(guesses[0])
for p0 in guesses:
try:
res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds,
options={'maxiter': 2500, 'ftol': 1e-10, 'gtol': 1e-10})
if res.fun < best_loss:
best_loss = res.fun
best_params = res.x
except Exception:
continue
return best_params
# EVOLVE-BLOCK-END
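# Minimal usage sketch (outside the evolve block). Assumes data_points columns are ordered
# [learning_rate, batch_size, tokens, parameters], matching scaling_law_func; the synthetic
# values below are illustrative only.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    X_demo = np.column_stack([
        rng.uniform(1e-4, 3e-3, n),                       # learning rate
        rng.choice([512.0, 1024.0, 2048.0, 4096.0], n),   # batch size
        rng.uniform(1e9, 5e10, n),                        # training tokens
        rng.uniform(1e8, 7e9, n),                         # model parameters
    ])
    # Losses generated from a known parameter vector plus small noise
    true_params = [1.7, 0.5, 0.2, 0.8, 0.25, 0.05, 0.0, -0.1, 0.1, 0.02, -0.1]
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, n)
    fitted = fit_scaling_law(X_demo, y_demo)
    print("fitted params:", np.round(fitted, 3))
    print("train MSE:", np.mean((scaling_law_func(X_demo, fitted) - y_demo) ** 2))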