# EVOLVE-BLOCK-START
"""
High-performance scaling law combining theoretical grounding with empirical optimization.
Restores the log-power formulation that achieved 0.5032, with critical improvements:
- Better numerical stability through logarithmic scaling
- Comprehensive parameter space with interaction modeling
- Three-stage optimization strategy
- Data-driven initialization with multiple emphases
- Light L2 regularization to discourage extreme parameter values
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
from scipy.stats import linregress
def scaling_law_func(data_points, params):
"""
Hybrid scaling law with log-power form:
loss = a + b*log(D)^alpha + c*log(N)^beta + d*(log(lr)-log(lr_opt))^2
+ e*log(bsz)^gamma + f*interaction
Theoretical grounding:
- Log-power scaling for data/params: captures Chinchilla scaling law structure
- Quadratic LR penalty: reflects optimization landscape near optimum
- Log-power batch size: models gradient noise scaling
- LR-BSZ interaction: synergistic effects on training dynamics
- Numerically stable across 8 orders of magnitude in hyperparameters
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
params = np.asarray(params, dtype=np.float64).flatten()
# Extract features with safety clipping for stability
lr = np.clip(X[:, 0], 1e-5, 1.0)
bsz = np.clip(X[:, 1], 1.0, 10000.0)
data_size = np.clip(X[:, 2], 1e8, 1e12)
param_size = np.clip(X[:, 3], 1e7, 1e10)
# Ensure 10 parameters
if len(params) < 10:
params = np.pad(params, (0, 10 - len(params)), mode='constant', constant_values=0.0)
a = params[0] # baseline loss
b = params[1] # data size coefficient
alpha = np.clip(params[2], 0.2, 2.5) # data size exponent
c = params[3] # param size coefficient
beta = np.clip(params[4], 0.2, 2.5) # param size exponent
d = np.clip(params[5], 0.0, 0.6) # learning rate penalty strength
lr_opt = np.clip(params[6], 1e-4, 0.1) # optimal learning rate
e = params[7] # batch size coefficient
gamma = np.clip(params[8], 0.05, 2.0) # batch size exponent
interaction = params[9] # lr-bsz interaction
# Logarithmic transformations for numerical stability
log_data = np.log(data_size)
log_param = np.log(param_size)
log_lr = np.log(lr)
log_bsz = np.log(bsz)
log_lr_opt = np.log(lr_opt)
# Data scaling: log-power form (more stable than direct power laws)
data_term = b * np.power(log_data, alpha)
# Parameter scaling: log-power form
param_term = c * np.power(log_param, beta)
# Learning rate term: quadratic penalty around optimal
    # Theory: the loss landscape is approximately quadratic near the optimal learning rate
lr_penalty = log_lr - log_lr_opt
lr_term = d * np.power(lr_penalty, 2)
# Batch size term: log-power form
# Theory: gradient noise scales as O(1/sqrt(batch_size))
bsz_term = e * np.power(log_bsz, gamma)
# Learning rate-batch size interaction: models synergistic effects
# Captures how larger batches can tolerate different learning rates
interaction_term = interaction * np.power(lr_penalty, 2) * log_bsz
# Combine all components with safety clipping
pred = a + data_term + param_term + lr_term + bsz_term + interaction_term
pred = np.clip(pred, 1.5, 4.5)
return pred
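

# Illustrative end-to-end usage (placeholder values, not fitted results), assuming the
# column order [learning_rate, batch_size, data_size, param_size] used above:
#   theta = fit_scaling_law(X_train, y_train)      # fit on observed runs
#   pred = scaling_law_func([[3e-4, 256, 1e10, 3e8]], theta)  # losses clipped to [1.5, 4.5]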
def fit_scaling_law(data_points, loss_values):
"""
Multi-stage optimization with data-driven initialization and aggressive refinement.
Strategy:
1. Multi-start local optimization from diverse initializations
2. Global differential evolution as primary search
3. Fine-tuning with aggressive local optimization
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64).flatten()
# Normalize for numerical stability during optimization
y_mean = np.mean(y)
y_std = np.std(y) + 1e-8
y_norm = (y - y_mean) / y_std
def objective(params):
try:
pred = scaling_law_func(X, params)
pred_norm = (pred - y_mean) / y_std
mse = np.mean((pred_norm - y_norm) ** 2)
            # Light L2 regularization on params[1:7] (b, alpha, c, beta, d, lr_opt)
            # to discourage extreme coefficient and exponent values
            reg = 0.003 * np.sum(params[1:7] ** 2)
            return mse + reg
        except Exception:
            return 1e10
# Data-driven initialization using linear regression in log space
log_data = np.log(X[:, 2])
log_param = np.log(X[:, 3])
log_lr = np.log(X[:, 0])
log_bsz = np.log(X[:, 1])
# Estimate slopes via linear regression (data-driven priors)
data_slope, _, _, _, _ = linregress(log_data, y)
param_slope, _, _, _, _ = linregress(log_param, y)
    lr_slope, _, _, _, _ = linregress(log_lr**2, y)  # estimated but not used by the inits below
bsz_slope, _, _, _, _ = linregress(log_bsz, y)
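    # The slopes above serve only as rough, data-driven priors; the divisors applied in
    # the inits below (roughly 1.5-6) are heuristic dampening factors for the log-power terms.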
# Multiple initializations with different emphases
# Each reflects a different hypothesis about scaling dynamics
inits = [
# Init 1: Balanced Chinchilla (equal data/param scaling)
np.array([y_mean, data_slope/1.8, 1.0, param_slope/1.8, 1.0, 0.07, 0.005, bsz_slope/4, 0.9, -0.005]),
# Init 2: Data-dominant (data scaling more important)
np.array([y_mean, data_slope/1.5, 1.2, param_slope/2.2, 0.8, 0.06, 0.006, bsz_slope/5, 1.0, 0.0]),
# Init 3: Parameter-dominant (parameter scaling more important)
np.array([y_mean, data_slope/2.2, 0.8, param_slope/1.5, 1.2, 0.08, 0.004, bsz_slope/5, 1.0, 0.01]),
# Init 4: Learning rate focused (stronger LR penalty)
np.array([y_mean, data_slope/2.0, 0.9, param_slope/2.0, 0.9, 0.12, 0.004, bsz_slope/6, 1.1, 0.02]),
# Init 5: Batch size focused (stronger batch effects)
np.array([y_mean, data_slope/2.5, 1.1, param_slope/2.5, 1.1, 0.05, 0.007, bsz_slope/3, 0.8, -0.02]),
# Init 6: Strong interaction effects
np.array([y_mean, data_slope/1.6, 1.3, param_slope/1.6, 1.3, 0.1, 0.006, bsz_slope/2, 1.0, 0.05]),
# Init 7: Weak exponents (smoother)
np.array([y_mean, data_slope/2.0, 0.7, param_slope/2.0, 0.7, 0.06, 0.005, bsz_slope/4, 0.7, 0.0]),
# Init 8: High interaction negative
np.array([y_mean, data_slope/1.9, 1.05, param_slope/1.9, 1.05, 0.075, 0.0055, bsz_slope/3.5, 0.95, -0.03]),
]
# Parameter bounds based on theoretical considerations
bounds = [
(1.5, 4.5), # a: baseline loss (within observed range)
(-0.5, 0.1), # b: data coefficient (mostly negative)
(0.2, 2.5), # alpha: data exponent
(-0.5, 0.1), # c: param coefficient (mostly negative)
(0.2, 2.5), # beta: param exponent
(0.0, 0.6), # d: lr penalty strength
(1e-4, 0.1), # lr_opt: optimal learning rate
(-0.3, 0.3), # e: batch size coefficient
(0.05, 2.0), # gamma: batch size exponent
(-0.15, 0.15), # interaction: LR-BSZ interaction
]
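    # Bounds follow the same parameter order unpacked in scaling_law_func:
    # (a, b, alpha, c, beta, d, lr_opt, e, gamma, interaction).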
best_params = None
best_loss = np.inf
# Stage 1: Multi-start local optimization from diverse initializations
for init in inits:
try:
result = minimize(
objective,
init,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 6000, 'ftol': 1e-10, 'gtol': 1e-8}
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
            pass
# Stage 2: Global optimization with differential evolution
# Use as primary search if local optimization didn't converge well
if best_params is None or best_loss > 0.25:
try:
result = differential_evolution(
objective,
bounds,
seed=42,
maxiter=2000,
popsize=35,
atol=1e-12,
tol=1e-12,
workers=1,
updating='deferred',
polish=True
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
if best_params is None:
best_params = inits[0]
# Stage 3: Fine-tune with aggressive optimization
if best_params is not None and best_loss < 0.5:
try:
result = minimize(
objective,
best_params,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 15000, 'ftol': 1e-12, 'gtol': 1e-9}
)
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
return best_params if best_params is not None else inits[0]
# EVOLVE-BLOCK-END
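

# --- Minimal smoke test (outside the evolved block) ---
# A sketch, assuming the feature layout [learning_rate, batch_size, data_size,
# param_size] inferred from scaling_law_func. It generates synthetic runs from a
# hand-picked parameter vector, refits them, and reports the training MSE; the
# values below are illustrative placeholders, not results from any real sweep.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    X_demo = np.column_stack([
        10.0 ** rng.uniform(-4, -1, n),    # learning rate
        2.0 ** rng.integers(5, 11, n),     # batch size (32..1024)
        10.0 ** rng.uniform(9, 11, n),     # data size in tokens
        10.0 ** rng.uniform(7.5, 9.5, n),  # parameter count
    ])
    # Hypothetical "true" coefficients in the order (a, b, alpha, c, beta, d, lr_opt, e, gamma, interaction)
    true_params = np.array([4.2, -0.04, 1.1, -0.05, 1.0, 0.05, 0.003, 0.01, 0.9, -0.005])
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, n)
    fitted = fit_scaling_law(X_demo, y_demo)
    mse = float(np.mean((scaling_law_func(X_demo, fitted) - y_demo) ** 2))
    print("fitted params:", np.round(fitted, 4))
    print("train MSE:", mse)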