# EVOLVE-BLOCK-START
"""
Optimized scaling law with proven data efficiency formulation
Form: L = A/U^α + B/P^β + C/(D^γ * U^0.15) + E
Focuses on numerical stability and robust parameter fitting
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution


def scaling_law_func(data_points, params):
    """
    Scaling law: L = A/U^α + B/P^β + C/(D^γ * U^0.15) + E

    Components:
    - A/U^α: unique-token diversity effect
    - B/P^β: model capacity (Chinchilla-style)
    - C/(D^γ * U^0.15): data efficiency moderated by unique content
    - E: irreducible loss floor

    Parameters: [A, α, B, β, C, γ, E] (7 params)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    if params.size != 7:
        params = np.array([8.5, 0.29, 92.0, 0.21, 46.0, 0.17, 2.05])

    U = np.maximum(X[:, 0], 1e6)  # unique_tokens
    P = np.maximum(X[:, 1], 1e7)  # params
    D = np.maximum(X[:, 2], 1e8)  # tokens

    A, alpha, B, beta, C, gamma, E = params
    # Constrain exponents for numerical stability
    alpha = np.clip(alpha, 0.05, 0.8)
    beta = np.clip(beta, 0.05, 0.7)
    gamma = np.clip(gamma, 0.05, 0.6)

    # Core terms
    term1 = A / np.power(U, alpha)
    term2 = B / np.power(P, beta)
    # Data efficiency term with a fixed U exponent for stability
    denom = np.power(D, gamma) * np.power(U, 0.15)
    term3 = C / np.maximum(denom, 1.0)

    return term1 + term2 + term3 + E
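
# Illustrative usage (hypothetical values, not part of the evolved interface):
# assuming the input columns are [unique_tokens, params, tokens] as read above,
# the default parameter vector can be evaluated on a single configuration:
#
#     X_demo = [[2.0e9, 1.0e9, 2.0e10]]                   # U, P, D
#     theta0 = [8.5, 0.29, 92.0, 0.21, 46.0, 0.17, 2.05]  # [A, α, B, β, C, γ, E]
#     scaling_law_func(X_demo, theta0)  # -> length-1 array of predicted loss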


def fit_scaling_law(data_points, loss_values):
    """
    Robust hybrid fit: a global differential-evolution search followed by
    multi-start L-BFGS-B refinement, with soft penalties on implausible
    predictions.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)

    # Data statistics used for bounds and initial guesses
    U_med = np.median(X[:, 0])
    P_med = np.median(X[:, 1])
    y_min, y_max = np.min(y), np.max(y)
    y_range = y_max - y_min
    y_mean = np.mean(y)

    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            if not np.all(np.isfinite(pred)):
                return 1e10
            # Primary MSE loss
            mse = np.mean((pred - y) ** 2)
            # Minimal regularization on the amplitude parameters A, B, C
            reg = 1e-9 * (params[0]**2 / 8000 + params[2]**2 / 8000 + params[4]**2 / 8000)
            # Soft penalties: negative predictions and large overshoots
            penalty = 0.0
            if np.any(pred < 0):
                penalty += 80.0 * np.sum(pred[pred < 0]**2)
            over_mask = pred > y_max + 2.2 * y_range
            if np.any(over_mask):
                penalty += 8.0 * np.sum((pred[over_mask] - y_max)**2)
            return mse + reg + penalty
        except Exception:
            return 1e10

    # Adaptive bounds for [A, alpha, B, beta, C, gamma, E]
    bounds = [
        (0.01, y_range * np.power(U_med, 0.42)),
        (0.05, 0.8),
        (0.01, y_range * np.power(P_med, 0.32)),
        (0.05, 0.7),
        (0.01, y_range * 140),
        (0.05, 0.6),
        (y_min * 0.45, y_max * 1.25)
    ]

    best_result = None
    best_score = float('inf')

    # Global search with differential evolution, restarted over several seeds
    for seed_val in [42, 99, 333, 777]:
        try:
            result = differential_evolution(
                objective,
                bounds,
                maxiter=480,
                popsize=23,
                seed=seed_val,
                atol=1e-9,
                tol=1e-9,
                workers=1,
                polish=True,
                strategy='best1bin',
                updating='deferred'
            )
            if result.fun < best_score:
                best_score = result.fun
                best_result = result.x
        except Exception:
            continue

    # Multi-start local optimization from hand-picked initial points
    init_points = [
        [8.5, 0.29, 92.0, 0.21, 46.0, 0.17, y_mean],
        [9.5, 0.31, 98.0, 0.23, 49.0, 0.19, y_mean * 0.97],
        [7.0, 0.26, 88.0, 0.19, 43.0, 0.15, y_mean * 1.03],
        [11.0, 0.33, 105.0, 0.25, 52.0, 0.21, y_mean * 0.94],
        [6.5, 0.24, 95.0, 0.20, 48.0, 0.18, y_mean * 1.06],
    ]
    if best_result is not None:
        init_points.insert(0, best_result)

    for init in init_points:
        try:
            result = minimize(
                objective,
                init,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 3500, 'ftol': 1e-12, 'gtol': 1e-10}
            )
            if result.fun < best_score:
                best_score = result.fun
                best_result = result.x
        except Exception:
            continue

    # Final ultra-fine refinement when a good solution has been found
    if best_result is not None and best_score < 0.15:
        try:
            result = minimize(
                objective,
                best_result,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 2500, 'ftol': 1e-14, 'gtol': 1e-12}
            )
            if result.fun < best_score:
                best_result = result.x
        except Exception:
            pass

    # Fallback to default parameters if every optimization attempt failed
    if best_result is None:
        best_result = np.array([8.5, 0.29, 92.0, 0.21, 46.0, 0.17, 2.05])

    return best_result
# EVOLVE-BLOCK-END
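

if __name__ == "__main__":
    # Minimal smoke test; an illustrative sketch only, outside the evolve
    # block. It assumes the column layout [unique_tokens, params, tokens] used
    # by scaling_law_func, fabricates synthetic configurations, and checks that
    # fit_scaling_law recovers a low-error fit. The full differential-evolution
    # search runs here, so this can take a minute or two.
    rng = np.random.default_rng(0)
    true_params = [8.5, 0.29, 92.0, 0.21, 46.0, 0.17, 2.05]

    # Hypothetical run configurations spanning a few orders of magnitude
    U = rng.uniform(1e8, 5e9, size=40)
    P = rng.uniform(5e7, 5e9, size=40)
    D = rng.uniform(1e9, 1e11, size=40)
    X_demo = np.column_stack([U, P, D])

    y_clean = scaling_law_func(X_demo, true_params)
    y_noisy = y_clean + rng.normal(0.0, 0.01, size=y_clean.shape)

    fitted = fit_scaling_law(X_demo, y_noisy)
    resid = scaling_law_func(X_demo, fitted) - y_clean
    print("fitted params:", np.round(fitted, 3))
    print("RMSE vs noise-free targets:", float(np.sqrt(np.mean(resid ** 2))))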