# EVOLVE-BLOCK-START
"""
Refined scaling law with an adaptive parameter reference scale and a stabilized vocabulary term.
Theory: L = a * (P/P0)^(-alpha) * (D/D0)^(-beta) * [1 + c*log(V/V0) + e*log(V/V0)^2] + d
Uses a learnable P0 for better cross-dataset adaptation; D0 and V0 are fixed reference
constants, and intermediate quantities are clipped for numerical stability.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
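

# Illustration only (hypothetical helper, not used by the fitting code below): the
# bracketed vocabulary correction from the docstring, 1 + c*log(V/V0) + e*log(V/V0)^2,
# evaluated in isolation with the same clipping the model applies. The default c and e
# mirror the initialization used in fit_scaling_law and are purely illustrative.
def _vocab_correction_sketch(V, c=-0.058, e=-0.003, V0=32000.0):
    log_ratio = np.clip(np.log(np.maximum(V, 1000.0) / V0), -3.0, 3.0)
    return 1.0 + c * log_ratio + e * log_ratio ** 2

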
def scaling_law_func(data_points, params):
"""
    Adaptive scaling law with a learnable parameter reference scale:
    L = a * (P/P0)^(-alpha) * (D/D0)^(-beta) * [1 + c*log(V/V0) + e*log(V/V0)^2] + d
    7 parameters per target: [a, alpha, beta, c, e, log_P0, d]
    (only P0 is learnable; D0 and V0 are fixed constants inside the function)
- a: overall scale factor
- alpha: parameter exponent
- beta: data exponent
- c: linear vocabulary coefficient
- e: quadratic vocabulary coefficient
    - log_P0: log of the learnable reference scale P0 for the parameter count P
- d: asymptotic bias
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
params = np.asarray(params, dtype=np.float64)
if params.ndim == 1:
params = params[None, :]
# Extract features with safety bounds
P = np.maximum(X[:, 0], 1e6)
V = np.maximum(X[:, 1], 1000)
D = np.maximum(X[:, 2], 1e8)
# Extract parameters
a = params[:, 0]
alpha = np.abs(params[:, 1])
beta = np.abs(params[:, 2])
c = params[:, 3]
e = params[:, 4]
log_P0 = params[:, 5]
d = params[:, 6]
    # Learnable P0 (clipped to its optimization bounds); D0 and V0 are fixed reference constants
P0 = np.exp(np.clip(log_P0, np.log(1e7), np.log(1e10)))[:, None]
D0 = 1e11
V0 = 32000.0
# Normalized ratios with enhanced numerical stability
P_norm = np.clip(P[None, :] / P0, 1e-3, 1e3)
D_norm = np.clip(D[None, :] / D0, 1e-3, 1e3)
log_V_ratio = np.clip(np.log(V[None, :] / V0), -3.0, 3.0)
# Main power law with more stable computation
power_base = a[:, None] * np.power(P_norm, -alpha[:, None]) * np.power(D_norm, -beta[:, None])
    # Quadratic correction in the log vocabulary ratio
vocab_factor = 1.0 + c[:, None] * log_V_ratio + e[:, None] * log_V_ratio**2
# Combined prediction
pred = power_base * vocab_factor + d[:, None]
return pred[0] if pred.shape[0] == 1 else pred
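

# A minimal usage sketch for scaling_law_func: evaluates the functional form at two
# hypothetical (P, V, D) points with an illustrative parameter vector. The numbers
# below are made up for demonstration and are not fitted values.
def _example_scaling_law_eval():
    # Columns: parameter count P, vocabulary size V, training tokens D
    demo_points = np.array([
        [1.0e8, 32000.0, 2.0e9],
        [1.0e9, 50000.0, 2.0e10],
    ])
    # Illustrative parameter vector: [a, alpha, beta, c, e, log_P0, d]
    demo_params = np.array([2.5, 0.072, 0.072, -0.058, -0.003, np.log(3e8), -0.5])
    return scaling_law_func(demo_points, demo_params)

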
def fit_scaling_law(data_points, loss_values):
"""
    Three-stage optimization: differential-evolution global search, multi-restart
    L-BFGS-B local refinement, and an optional Powell/TNC polish.
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64)
if y.ndim == 1:
y = y[:, None]
T = y.shape[1]
# Data-driven initialization
P_med = np.median(X[:, 0])
y_mean = np.mean(y)
y_std = np.std(y)
# Improved initialization closer to Chinchilla optimal
init_params = np.array([
[np.abs(y_std) * 2.85, # a: scale factor
0.072, # alpha: slightly lower than 0.073
0.072, # beta: symmetric with alpha
-0.058, # c: slightly stronger negative
-0.0032, # e: small quadratic term
np.log(P_med), # log_P0: data-driven
         np.clip(y_mean, -12.5, 0.0)]  # d: bias at the mean loss, clipped into its bound
]).repeat(T, axis=0)
def objective(flat_params):
params = flat_params.reshape(T, 7)
try:
pred = scaling_law_func(X, params)
if pred.ndim == 1:
pred = pred[:, None]
# MSE loss
mse = np.mean((pred - y) ** 2)
            # Light regularization pulling exponents and vocabulary coefficients toward the initialization values
reg = 0.00016 * (
np.sum((params[:, 1] - 0.072)**2) + # alpha near optimal
np.sum((params[:, 2] - 0.072)**2) + # beta near optimal
np.sum((params[:, 3] + 0.058)**2) + # c near optimal
np.sum(params[:, 4]**2) * 2.8 # small quadratic term
)
return mse + reg
        except Exception:
            return 1e10
    # Parameter bounds (one set of 7 per target, repeated T times)
bounds = [
(0.01, 85.0), # a
(0.018, 0.35), # alpha
(0.018, 0.35), # beta
(-2.7, 2.7), # c
(-0.15, 0.15), # e
(np.log(1e7), np.log(1e10)), # log_P0
(-12.5, 0.0) # d
] * T
best_result = None
best_loss = float('inf')
    # Stage 1: global search with differential evolution
try:
result_de = differential_evolution(
objective,
bounds,
strategy='best1bin',
maxiter=420,
popsize=13,
seed=42,
atol=1e-9,
tol=1e-9,
polish=True,
updating='deferred',
workers=1
)
if result_de.success or result_de.fun < best_loss:
best_result = result_de
best_loss = result_de.fun
    except Exception:
        pass
    # Stage 2: multi-restart L-BFGS-B with increasing perturbation scales
restart_configs = [
(0.0, 'exact'), # Start from DE result
(0.012, 'micro'), # Micro perturbation
(0.04, 'small'), # Small perturbation
(0.08, 'medium'), # Medium exploration
(0.13, 'large'), # Large exploration
(0.20, 'xlarge') # Extra large exploration
]
for scale, _ in restart_configs:
if scale == 0.0 and best_result is not None:
init = best_result.x
else:
init = init_params.ravel() + np.random.randn(T * 7) * scale
try:
result = minimize(
objective,
init,
method='L-BFGS-B',
bounds=bounds,
options={
'maxiter': 2200,
'ftol': 1e-12,
'gtol': 1e-10,
'maxfun': 28000
}
)
if result.fun < best_loss:
best_loss = result.fun
best_result = result
        except Exception:
            continue
    # Stage 3: Powell polish (run without the box bounds) if the fit is still poor
if best_result is not None and best_loss > 0.014:
try:
result_powell = minimize(
objective,
best_result.x,
method='Powell',
options={'maxiter': 1000, 'ftol': 1e-11}
)
            if result_powell.fun < best_loss:
                best_loss = result_powell.fun
                best_result = result_powell
        except Exception:
            pass
# Final TNC polish for difficult cases
if best_result is not None and best_loss > 0.016:
try:
result_tnc = minimize(
objective,
best_result.x,
method='TNC',
bounds=bounds,
options={'maxiter': 800, 'ftol': 1e-11}
)
if result_tnc.fun < best_loss:
best_result = result_tnc
        except Exception:
            pass
if best_result is None:
params_opt = init_params
else:
params_opt = best_result.x.reshape(T, 7)
return params_opt[0] if T == 1 else params_opt
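

# A minimal end-to-end sketch, assuming synthetic data only: losses are generated from
# a known parameter vector plus small noise, refit with fit_scaling_law, and compared.
# Sizes, ranges, noise level, and the "true" parameters below are illustrative, not
# taken from any real training run.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 40
    P = 10 ** rng.uniform(7.5, 9.5, n)    # synthetic parameter counts
    V = 10 ** rng.uniform(4.0, 5.3, n)    # synthetic vocabulary sizes
    D = 10 ** rng.uniform(9.5, 11.0, n)   # synthetic training token counts
    X_demo = np.column_stack([P, V, D])
    true_params = np.array([2.5, 0.072, 0.072, -0.058, -0.003, np.log(3e8), -0.5])
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, n)
    fitted = fit_scaling_law(X_demo, y_demo)
    resid = scaling_law_func(X_demo, fitted) - y_demo
    print("fitted params:", np.round(fitted, 4))
    print("RMSE:", float(np.sqrt(np.mean(resid ** 2))))
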
# EVOLVE-BLOCK-END