# EVOLVE-BLOCK-START
"""
Enhanced scaling law with adaptive reference scales and improved regularization
Achieves better generalization through data-informed normalization and
sophisticated parameter-specific regularization strategies
Uses 7-parameter additive model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
def scaling_law_func(data_points, params):
    """
    Additive power-law scaling law with 7 parameters.
    Model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
    where P_nv is the non-vocabulary parameter count, V the vocabulary size,
    and N_chars the number of training characters.
    Inputs are normalized with adaptive, data-derived reference scales for
    better numerical behavior.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    if X.shape[1] != 3:
        raise ValueError(f"Expected 3 features, got {X.shape[1]}")
    P_nv = X[:, 0]
    V = X[:, 1]
    N_chars = X[:, 2]
    # Pad with zeros if fewer than 7 parameters are supplied
    if len(params) < 7:
        params = np.concatenate([params, np.zeros(7 - len(params))])
    a0, a1, a2, a3, a4, a5, a6 = params[:7]
    # Adaptive reference scales: the geometric midpoint of each feature's positive
    # range (geometric mean of its min and max), which is more stable than fixed
    # constants across different data distributions.
    # Assumes each feature column contains at least one positive value.
    P_ref = np.exp(0.5 * (np.log(np.min(P_nv[P_nv > 0]) + 1e-10) +
                          np.log(np.max(P_nv) + 1e-10)))
    V_ref = np.exp(0.5 * (np.log(np.min(V[V > 0]) + 1e-10) +
                          np.log(np.max(V) + 1e-10)))
    N_ref = np.exp(0.5 * (np.log(np.min(N_chars[N_chars > 0]) + 1e-10) +
                          np.log(np.max(N_chars) + 1e-10)))
    # Normalize inputs by their reference scales
    P_nv_norm = P_nv / P_ref
    V_norm = V / V_ref
    N_chars_norm = N_chars / N_ref
    # Clip normalized inputs so the power terms stay numerically stable for typical exponent ranges
    P_nv_norm = np.clip(P_nv_norm, 1e-4, 1e4)
    V_norm = np.clip(V_norm, 1e-4, 1e4)
    N_chars_norm = np.clip(N_chars_norm, 1e-4, 1e4)
    # Power-law terms
    term_params = a1 * np.power(P_nv_norm, a2)
    term_vocab = a3 * np.power(V_norm, a4)
    term_data = a5 * np.power(N_chars_norm, a6)
    return a0 + term_params + term_vocab + term_data
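# Illustrative sketch (not part of the fitted pipeline): evaluating the additive
# model on a couple of synthetic points. The feature magnitudes and parameter
# values below are hypothetical, chosen only to show the expected call signature
# and output shape.
def _example_evaluate():
    demo_X = np.array([
        [1.2e8, 3.2e4, 5.0e9],   # [non-vocab params, vocab size, training chars]
        [3.5e8, 5.0e4, 2.0e10],
    ])
    demo_params = np.array([1.5, 2.0, -0.4, 1.0, -0.2, 2.0, -0.3])
    return scaling_law_func(demo_X, demo_params)  # -> array of 2 predicted losses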
def fit_scaling_law(data_points, loss_values):
    """
    Hierarchical three-phase optimization:
    Phase 1: global exploration with differential evolution.
    Phase 2: local refinement from theory-informed initializations.
    Phase 3: final convergence with tight tolerances, followed by a narrow
             global polish around the best solution found.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    # Standardize targets so the MSE and regularization terms share a common scale
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_norm = (y - y_mean) / y_std
    # Domain-informed bounds; exponents are constrained more tightly than coefficients
    bounds = [
        (-2.0, 2.0),     # a0: intercept (baseline loss)
        (-15.0, 15.0),   # a1: parameter coefficient
        (-1.3, 1.3),     # a2: parameter exponent (typical fits fall in -0.5 to 0)
        (-15.0, 15.0),   # a3: vocab coefficient
        (-1.0, 0.3),     # a4: vocab exponent (typical fits fall in -0.4 to -0.1)
        (-15.0, 15.0),   # a5: data coefficient
        (-0.5, -0.01),   # a6: data exponent (constrained to be negative)
    ]
    def objective(params):
        """Normalized MSE plus parameter-specific regularization"""
        try:
            pred = scaling_law_func(X, params)
            if not np.all(np.isfinite(pred)):
                return 1e10
            pred_norm = (pred - y_mean) / y_std
            mse = np.mean((pred_norm - y_norm) ** 2)
            # Parameter-specific regularization (stronger on exponents than coefficients)
            reg_coeff = 0.0005 * (params[1]**2 + params[3]**2 + params[5]**2)
            reg_exp = 0.0020 * (params[2]**2 + params[4]**2 + params[6]**2)
            reg_intercept = 0.0010 * params[0]**2
            return mse + reg_coeff + reg_exp + reg_intercept
        except Exception:
            return 1e10
    # Phase 1: Global exploration with differential evolution
    result_de = differential_evolution(
        objective, bounds, seed=42, maxiter=600, popsize=28,
        atol=1e-11, tol=1e-11, mutation=(0.5, 1.5), recombination=0.8,
        workers=1, updating='deferred', polish=False
    )
    best_params = result_de.x
    best_loss = result_de.fun
    # Phase 2: Local refinement from multiple theory-informed initializations
    smart_inits = [
        result_de.x,  # Best from DE
        np.array([-0.2, -2.8, -0.55, -3.2, -0.28, -3.2, -0.22]),  # Refined Chinchilla
        np.array([-0.8, -3.2, -0.75, -1.8, -0.15, -2.8, -0.28]),  # Data-heavy
        np.array([-0.4, -1.8, -0.35, -3.8, -0.35, -3.8, -0.35]),  # Vocab-heavy
        np.array([-0.6, -2.2, -0.65, -2.5, -0.22, -2.2, -0.18]),  # Balanced
        np.array([-0.3, -2.0, -0.45, -2.8, -0.30, -3.5, -0.25]),  # Parameter-focused
        np.array([-1.0, -1.5, -0.80, -1.2, -0.10, -2.0, -0.12]),  # Minimal scaling
    ]
    for init_params in smart_inits:
        # Clip to bounds to ensure a valid initialization
        init_params = np.array([np.clip(p, b[0], b[1]) for p, b in zip(init_params, bounds)])
        result = minimize(
            objective, init_params, method='L-BFGS-B', bounds=bounds,
            options={'ftol': 1e-13, 'gtol': 1e-12, 'maxiter': 1000, 'maxls': 120}
        )
        if result.fun < best_loss:
            best_loss = result.fun
            best_params = result.x
    # Phase 3: Final tight refinement from the best candidate so far
    result_final = minimize(
        objective, best_params, method='L-BFGS-B', bounds=bounds,
        options={'ftol': 1e-15, 'gtol': 1e-13, 'maxiter': 2000, 'maxls': 200}
    )
    if result_final.fun < best_loss:
        best_loss = result_final.fun
        best_params = result_final.x
    # Phase 3b: Narrow global polish in a tight box around the best solution
    tight_bounds = [
        (max(b[0], p - 0.25), min(b[1], p + 0.25))
        for b, p in zip(bounds, best_params)
    ]
    result_polish = differential_evolution(
        objective, tight_bounds, seed=43, maxiter=200, popsize=15,
        atol=1e-12, tol=1e-12, mutation=(0.5, 1.5), recombination=0.9,
        workers=1, updating='deferred', polish=True
    )
    if result_polish.fun < best_loss:
        best_params = result_polish.x
    return best_params
# EVOLVE-BLOCK-END
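# Illustrative usage sketch (kept outside the evolve block): fit the law on
# synthetic data and inspect the fit. The data ranges and generating parameters
# below are hypothetical, chosen only to demonstrate the fit/predict round trip;
# the three-phase fit can take a while because of differential evolution.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_X = np.column_stack([
        rng.uniform(5e7, 5e8, size=32),   # non-vocab parameter counts
        rng.uniform(1e4, 1e5, size=32),   # vocabulary sizes
        rng.uniform(1e9, 5e10, size=32),  # training characters
    ])
    true_params = np.array([1.8, 1.5, -0.4, 1.0, -0.2, 2.0, -0.3])
    demo_y = scaling_law_func(demo_X, true_params) + rng.normal(0.0, 0.01, size=32)
    fitted = fit_scaling_law(demo_X, demo_y)
    print("fitted parameters:", fitted)
    preds = scaling_law_func(demo_X, fitted)
    print("train RMSE:", np.sqrt(np.mean((preds - demo_y) ** 2)))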