# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM easy-question scenarios with U-shaped pattern
Optimized 6-parameter centered model with robust hybrid optimization
Captures double descent via centered quadratic + linear + power + cubic terms
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
def scaling_law_func(data_points, params):
"""
U-shaped scaling law with 6 parameters using centered parameterization:
f(x) = a*(x-x0)^2 + b*(x-x0) + c + d*|x-x0|^1.5 + e*(x-x0)^3 + f
where x0 = mean(x) is data-adaptive centering for numerical stability.
This captures:
- Quadratic term (a): main U-shape structure
- Linear term (b): asymmetry and overall trend
- Bias (c): primary vertical shift
- Power term (d): smooth transitions and asymptotic behavior via 1.5 exponent
- Cubic term (e): fine curvature adjustments
- Offset (f): additional vertical flexibility
params: [a, b, c, d, e, f] (6 parameters, max allowed)
"""
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    x = X[:, 0]  # column 0 holds log_flops
    params = np.asarray(params, dtype=np.float64).flatten()
    # Pad with zeros if fewer than 6 parameters were supplied
    if len(params) < 6:
        params = np.pad(params, (0, 6 - len(params)), mode='constant', constant_values=0)
    a, b, c, d, e, f = params[:6]
    # Data-adaptive centering for numerical stability
    x0 = np.mean(x)
    x_centered = x - x0
    # Clip parameters to the same ranges used as optimizer bounds, preventing overflow
    a = np.clip(a, -50, 50)
    b = np.clip(b, -50, 50)
    c = np.clip(c, -50, 50)
    d = np.clip(d, -30, 30)
    e = np.clip(e, -30, 30)
    f = np.clip(f, -50, 50)
    # Build the prediction from all components
    quad_term = a * (x_centered ** 2)
    linear_term = b * x_centered
    power_term = d * np.sign(x_centered) * (np.abs(x_centered) ** 1.5)
    cubic_term = e * (x_centered ** 3)
    # Combined prediction; c and f together form a single intercept
    pred = quad_term + linear_term + power_term + cubic_term + c + f
    return pred
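# Note: scaling_law_func expects 2-D input of shape (N, D) with log-FLOPs in
# column 0. A 1-D array of N points would be promoted to shape (1, N) by
# np.atleast_2d, and X[:, 0] would then pick out a single value, so callers
# should reshape 1-D inputs to (N, 1) first.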
def fit_scaling_law(data_points, loss_values):
"""
Fit U-shaped scaling law using robust multi-phase optimization:
Phase 1: Global search with differential evolution
Phase 2: Local refinement with L-BFGS-B
Phase 3: Multiple diverse restarts for robustness
Phase 4: Final polish for convergence
"""
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    y_mean = np.mean(y)
    y_range = np.max(y) - np.min(y)
    if y_range < 1e-10:
        y_range = 1.0  # guard against constant targets
    def objective(params):
        """MSE objective with light L2 regularization"""
        try:
            pred = scaling_law_func(X, params)
            # Guard against numerical issues
            if not np.all(np.isfinite(pred)):
                return 1e10
            mse = np.mean((pred - y) ** 2)
            # Very light L2 regularization to encourage simpler solutions
            reg = 0.0005 * np.sum(params ** 2)
            return mse + reg
        except (ValueError, FloatingPointError, OverflowError):
            return 1e10
    # Parameter bounds, matching the clipping ranges inside scaling_law_func
    bounds = [
        (-50.0, 50.0),  # a: quadratic coefficient
        (-50.0, 50.0),  # b: linear coefficient
        (-50.0, 50.0),  # c: bias term
        (-30.0, 30.0),  # d: power-term coefficient
        (-30.0, 30.0),  # e: cubic coefficient
        (-50.0, 50.0),  # f: additional offset
    ]
    best_params = None
    best_loss = float('inf')
    # Phase 1: Global search with differential evolution
    try:
        de_result = differential_evolution(
            objective,
            bounds,
            seed=42,
            maxiter=600,
            popsize=25,
            atol=1e-10,
            tol=1e-10,
            workers=1,
            updating='deferred',
            polish=True,
        )
        if de_result.fun < best_loss:
            best_loss = de_result.fun
            best_params = de_result.x
    except Exception:
        pass
    # Phase 2: Local refinement with L-BFGS-B
    if best_params is not None:
        try:
            refine_result = minimize(
                objective,
                best_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 1000, 'ftol': 1e-11, 'gtol': 1e-9}
            )
            if refine_result.fun < best_loss:
                best_loss = refine_result.fun
                best_params = refine_result.x
        except Exception:
            pass
    # Phase 3: Multiple restarts from diverse initializations
    rng = np.random.default_rng(42)  # seeded so the random restarts are reproducible
    init_strategies = [
        lambda: np.array([0.3, -0.5, y_mean, 0.1, 0.05, 0.0]),
        lambda: np.array([0.8, -1.0, y_mean, 0.2, 0.1, 0.0]),
        lambda: np.array([0.1, 0.0, y_mean, 0.05, 0.02, 0.0]),
        lambda: np.array([-0.2, 0.5, y_mean, -0.1, -0.05, 0.0]),
        lambda: np.array([0.5, -0.2, y_mean, 0.15, 0.08, -0.5 * y_range]),
        lambda: np.array([1.0, -1.5, y_mean, 0.3, 0.15, 0.0]),
        lambda: rng.uniform(-10, 10, 6),
        lambda: rng.uniform(-5, 5, 6),
    ]
    lower = np.array(bounds)[:, 0]
    upper = np.array(bounds)[:, 1]
    for init_func in init_strategies:
        try:
            # Clip the starting point into the feasible box
            init_params = np.clip(init_func(), lower, upper)
            result = minimize(
                objective,
                init_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 500, 'ftol': 1e-10, 'gtol': 1e-9}
            )
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
    # Phase 4: Final polish if we have a solution
    if best_params is not None:
        try:
            polish_result = minimize(
                objective,
                best_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 2000, 'ftol': 1e-12, 'gtol': 1e-10}
            )
            if polish_result.fun < best_loss:
                best_loss = polish_result.fun
                best_params = polish_result.x
        except Exception:
            pass
    # Fallback initialization if every optimization attempt failed
    if best_params is None:
        best_params = np.array([
            0.5,     # a: moderate positive quadratic for the U-shape
            -0.5,    # b: slight negative linear trend
            y_mean,  # c: intercept at the data mean
            0.1,     # d: small power term
            0.05,    # e: small cubic term
            0.0,     # f: no additional offset
        ])
    return best_params
# EVOLVE-BLOCK-END
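# Minimal usage sketch (outside the evolve block), assuming inputs are
# (log_flops, loss) pairs; the synthetic data below is illustrative only,
# not a real benchmark.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    log_flops = np.linspace(18.0, 24.0, 40)
    # Hypothetical U-shaped ground truth plus observation noise
    true_loss = 0.4 * (log_flops - 21.0) ** 2 - 0.3 * (log_flops - 21.0) + 2.0
    noisy_loss = true_loss + rng.normal(0.0, 0.05, size=log_flops.shape)
    X_demo = log_flops.reshape(-1, 1)  # shape (N, 1): column 0 is log_flops
    fitted = fit_scaling_law(X_demo, noisy_loss)
    preds = scaling_law_func(X_demo, fitted)
    rmse = float(np.sqrt(np.mean((preds - noisy_loss) ** 2)))
    print("fitted params:", np.round(fitted, 4))
    print("train RMSE:", round(rmse, 4))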