# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
Implements a Grid Search + L-BFGS-B strategy (with precomputed feature powers
and closed-form linear solves) to fit additive power-law parameters with
7 degrees of freedom.
"""
import numpy as np
import itertools
from scipy.optimize import minimize
# Fixed scaling factors for numerical stability
# Based on typical LLM scales: Params~1e9, Vocab~1e4, Data~1e11
SCALES = np.array([1e9, 1e4, 1e11])
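# e.g. 7e9 non-vocab params / 1e9 -> 7.0, keeping the power-law bases O(1)
# so exponentiation stays numerically well-conditioned.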
def scaling_law_func(data_points, params):
"""
    Predicts loss using a sum of power laws with a bias term.
Formula: L = Bias + c1*(N/S1)^e1 + c2*(V/S2)^e2 + c3*(D/S3)^e3
    Args:
        data_points: (N, 3) array [non_vocab_params, vocab_size, num_characters]
        params: (7,) or (T, 7) array [bias, c1, c2, c3, e1, e2, e3]

    Returns:
        (N,) array of predictions, or an (N, T) array when T parameter sets
        are supplied at once.
    """
X = np.atleast_2d(np.asarray(data_points))
# Normalize inputs
# X columns: [non_vocab_params, vocab_size, num_characters]
X_norm = X / SCALES[None, :]
params = np.asarray(params)
if params.ndim == 1:
params = params[None, :]
# Unpack parameters
bias = params[:, 0] # (T,)
coeffs = params[:, 1:4] # (T, 3)
exponents = params[:, 4:7] # (T, 3)
    # Compute power terms: (N, 1, 3) ** (1, T, 3) -> (N, T, 3) via broadcasting.
    # abs() guards against NaNs from fractional powers of negative bases,
    # though the inputs are expected to be positive.
    terms = np.abs(X_norm[:, None, :]) ** exponents[None, :, :]
# Weighted sum: sum over the 3 features
# (N, T, 3) * (T, 3) -> (N, T, 3) -> sum(axis=2) -> (N, T)
pred = (terms * coeffs[None, :, :]).sum(axis=2) + bias[None, :]
if pred.shape[1] == 1:
return pred[:, 0]
return pred
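# Example (an illustrative sketch; the parameter values below are made up,
# not fitted):
#
#   p = np.array([1.5, 0.5, 0.2, 0.8, -0.3, -0.1, -0.2])
#   scaling_law_func(np.array([[7e9, 3.2e4, 2e11]]), p)
#   # -> array([2.653...]) = 1.5 + 0.5*7**-0.3 + 0.2*3.2**-0.1 + 0.8*2**-0.2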
def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law parameters using a hybrid Grid Search + L-BFGS-B approach.
Uses a dense grid of exponents to locate the global basin of attraction,
solved efficiently via Linear Least Squares, followed by gradient-based refinement.
"""
X = np.atleast_2d(np.asarray(data_points))
y = np.asarray(loss_values)
if y.ndim == 1:
y = y[:, None]
# Normalize X for fitting
X_norm = X / SCALES[None, :]
    # Define a dense grid of candidate exponents.
    # Covers decay (negative), growth (positive), and near-log (close to 0) regimes.
grid_vals = [-1.5, -1.0, -0.75, -0.5, -0.33, -0.2, -0.1, -0.05,
0.05, 0.1, 0.2, 0.33, 0.5, 0.75, 1.0, 1.5]
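    # 16 candidate values per exponent -> 16**3 = 4096 exponent combinations
    # per target, each solved in closed form below, so the grid stage is cheap.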
# Precompute feature powers to optimize the inner loop
# feats[dim] is shape (N, grid_size)
feats = [np.abs(X_norm[:, i:i+1]) ** np.array(grid_vals)[None, :] for i in range(3)]
N_targets = y.shape[1]
final_params = []
# Pre-allocate ones for bias term
ones = np.ones((X.shape[0], 1))
# Grid indices
grid_indices = range(len(grid_vals))
for i in range(N_targets):
y_target = y[:, i]
candidates = []
# 1. Grid Search with Linear Least Squares
# Iterate over all combinations of exponents for the 3 features
for idx in itertools.product(grid_indices, repeat=3):
i1, i2, i3 = idx
# Construct design matrix A: [1, x1^e1, x2^e2, x3^e3]
# Use precomputed columns
A = np.column_stack((ones, feats[0][:, i1], feats[1][:, i2], feats[2][:, i3]))
# Solve linear system A * w = y
# w = [bias, c1, c2, c3]
            try:
                # rcond=None uses NumPy's machine-precision-based cutoff for
                # small singular values.
                w, res, rank, s = np.linalg.lstsq(A, y_target, rcond=None)
                # lstsq returns the summed squared residual only when A has full
                # column rank and more rows than columns; otherwise compute the
                # MSE directly from the solution.
                if res.size > 0:
                    mse = res[0] / X.shape[0]
                else:
                    mse = np.mean((A @ w - y_target) ** 2)
# Store full param vector: [bias, c1, c2, c3, e1, e2, e3]
p_vec = np.array([w[0], w[1], w[2], w[3],
grid_vals[i1], grid_vals[i2], grid_vals[i3]])
candidates.append((mse, p_vec))
            except np.linalg.LinAlgError:
                # Skip degenerate design matrices that fail to solve.
                continue
        # Drop non-finite fits, sort by MSE, and keep the top candidates as
        # starting points for refinement.
        candidates = [c for c in candidates if np.isfinite(c[0])]
        candidates.sort(key=lambda c: c[0])
        top_k = 10
        refinement_starts = [c[1] for c in candidates[:top_k]]
# Fallback: Constant mean model
p_const = np.zeros(7)
p_const[0] = np.mean(y_target)
refinement_starts.append(p_const)
best_mse = float('inf')
best_p = refinement_starts[0]
# Objective function for optimizer
def objective(p):
preds = scaling_law_func(X, p)
return np.mean((preds - y_target) ** 2)
# 2. Refine with L-BFGS-B
# Bounds: Exponents in [-3, 3] to prevent overflow/instability
# Coefficients/Bias unbounded
bounds = [(None, None)]*4 + [(-3.0, 3.0)]*3
for init_p in refinement_starts:
try:
res = minimize(objective, init_p, method='L-BFGS-B', bounds=bounds,
options={'ftol': 1e-9, 'maxiter': 1000})
if res.fun < best_mse:
best_mse = res.fun
best_p = res.x
            except Exception:
                # Skip starting points where the optimizer raises (e.g. on
                # non-finite objective values).
                continue
final_params.append(best_p)
    result = np.vstack(final_params)
    return result[0] if N_targets == 1 else result
# EVOLVE-BLOCK-END
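# A minimal smoke test (a sketch under assumed data ranges; not part of the
# evolved block): fit synthetic losses generated from a known parameter vector
# and report the recovered fit quality.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    true_p = np.array([1.2, 0.6, 0.15, 0.9, -0.4, -0.2, -0.3])  # illustrative
    X_demo = np.column_stack((
        rng.uniform(1e8, 1e10, 50),   # non-vocab parameters
        rng.uniform(5e3, 5e4, 50),    # vocabulary size
        rng.uniform(1e10, 1e12, 50),  # training characters
    ))
    y_demo = scaling_law_func(X_demo, true_p) + rng.normal(0.0, 1e-3, 50)
    fitted = fit_scaling_law(X_demo, y_demo)
    mse = np.mean((scaling_law_func(X_demo, fitted) - y_demo) ** 2)
    print("fitted params:", fitted)
    print("MSE: %.3e" % mse)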