# EVOLVE-BLOCK-START
"""
Scaling law for LLM finetuning: L(N) = a + b/N^α + c*log(N)
Evolved Chinchilla-inspired 4-parameter form with improved initialization.
Data-adaptive bounds and optimized hybrid optimization strategy.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution


def scaling_law_func(data_points, params):
    """
    Chinchilla-inspired scaling law: L(N) = a + b / N^α + c * log(N)

    params: [a, b, alpha, c] (4 parameters)
      - a: asymptotic loss floor
      - b: power-law amplitude
      - alpha: power-law exponent (clipped to [0.01, 2.0])
      - c: logarithmic correction coefficient
    """
    X = np.atleast_2d(np.asarray(data_points))
    N = X[:, 0]
    params = np.asarray(params).flatten()
    # Tolerate short parameter vectors by zero-padding to 4 entries.
    if len(params) < 4:
        params = np.pad(params, (0, 4 - len(params)), constant_values=0.0)
    a, b, alpha, c = params[:4]
    alpha = np.clip(alpha, 0.01, 2.0)
    N_safe = np.maximum(N, 1.0)  # guard against log/power of non-positive N
    loss = a + b / (N_safe ** alpha) + c * np.log(N_safe)
    return loss
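
# Illustrative usage sketch (the parameter values below are assumptions for
# demonstration, not fitted values): with params = [a, b, alpha, c] =
# [2.0, 50.0, 0.3, 0.0], the predicted losses at N = 1e6, 1e8, 1e10 are roughly
# [2.79, 2.20, 2.05], decaying monotonically toward the floor a = 2.0 because
# the c * log(N) term is zeroed out here.
#
#   example_losses = scaling_law_func(np.array([[1e6], [1e8], [1e10]]),
#                                     [2.0, 50.0, 0.3, 0.0])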


def fit_scaling_law(data_points, loss_values):
    """
    Fit the 4-parameter scaling law with a hybrid strategy:
    multi-start L-BFGS-B, an adaptive differential-evolution fallback,
    and a final local refinement. Initialization and parameter bounds
    are derived from the data.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).flatten()
    N = X[:, 0]
    # Robust summary statistics used for initialization and bounds
    y_min, y_max = np.min(y), np.max(y)
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_range = y_max - y_min + 1e-8
    N_min, N_max = np.min(N), np.max(N)
    log_N_min, log_N_max = np.log(N_min), np.log(N_max)
    log_N_range = log_N_max - log_N_min
    # Estimate the power-law exponent via log-log regression on (N, y - y_min)
    N_safe = np.maximum(N, 1.0)
    log_N = np.log(N_safe)
    log_y_centered = np.log(np.maximum(y - y_min + 1e-8, 1e-8))
    # Slope estimation restricted to finite points
    valid_idx = np.isfinite(log_y_centered)
    if np.sum(valid_idx) > 2:
        slope = np.polyfit(log_N[valid_idx], log_y_centered[valid_idx], 1)[0]
        alpha_init = np.clip(-slope, 0.1, 1.5)
    else:
        alpha_init = 0.5
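    # Worked example of the initialization (assumed numbers, for illustration):
    # if y - y_min is roughly proportional to N ** -0.5, the log-log slope is
    # about -0.5, so alpha_init = clip(0.5, 0.1, 1.5) = 0.5.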
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
                return 1e10
            return np.mean((pred - y) ** 2)  # mean squared error
        except Exception:
            return 1e10
    # Data-adaptive bounds
    b_range = y_range * (N_max - N_min)
    bounds = [
        (y_min * 0.3, y_max * 1.5),  # a: asymptotic floor (wider range)
        (-b_range, b_range),         # b: adaptive amplitude
        (0.01, 2.0),                 # alpha: standard exponent
        (-1.0, 1.0),                 # c: wider log correction range
    ]
    best_params = None
    best_loss = np.inf
    # Evolved multi-start initialization with better statistical grounding
    init_candidates = [
        # Power-law dominant (various exponents)
        [y_min, y_range * (N_max - N_min) * 0.5, 0.25, 0.0],
        [y_min, y_range * (N_max - N_min) * 0.5, alpha_init, 0.0],
        [y_min, y_range * (N_max - N_min) * 0.5, 0.8, 0.0],
        [y_min, y_range * (N_max - N_min) * 0.5, 1.2, 0.0],
        # With logarithmic correction
        [y_min, y_range * (N_max - N_min) * 0.5, alpha_init, -0.1 * y_range / log_N_range],
        [y_min, y_range * (N_max - N_min) * 0.5, alpha_init, 0.1 * y_range / log_N_range],
        # Higher baseline
        [y_mean * 0.8, y_range * (N_max - N_min) * 0.3, alpha_init, 0.0],
        [y_mean * 0.8, y_range * (N_max - N_min) * 0.3, 0.6, -0.05 * y_range / log_N_range],
    ]
    # Phase 1: quick local optimization from multiple initializations
    for init in init_candidates:
        try:
            result = minimize(objective, init, method='L-BFGS-B', bounds=bounds,
                              options={'ftol': 1e-6, 'maxiter': 500})
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
    # Phase 2: adaptive global search if local search is insufficient.
    # Trigger DE if (1) no solution was found, or (2) the fit error is still
    # high relative to the squared data range.
    de_threshold = y_range ** 2 * 0.03
    if best_params is None or best_loss > de_threshold:
        try:
            result = differential_evolution(
                objective, bounds, seed=42, maxiter=300,
                atol=1e-8, tol=1e-8, workers=1,
                strategy='best1bin', popsize=15
            )
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            if best_params is None:
                best_params = init_candidates[0]
    # Phase 3: aggressive local refinement with tight tolerance
    if best_params is not None:
        try:
            result = minimize(objective, best_params, method='L-BFGS-B', bounds=bounds,
                              options={'ftol': 1e-9, 'maxiter': 3000})
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
    # Safeguard: ensure a valid 4-parameter output
    if best_params is None:
        best_params = np.array([y_min, y_range * (N_max - N_min) * 0.5, 0.5, 0.0])
    return np.asarray(best_params)[:4]
# EVOLVE-BLOCK-END
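

if __name__ == "__main__":
    # Minimal smoke-test sketch, kept outside the evolve block: generate
    # synthetic losses from assumed ground-truth parameters, add small noise,
    # refit, and compare. The "true" values below are made up for demonstration
    # and do not come from any real finetuning run.
    rng = np.random.default_rng(0)
    true_params = [1.8, 40.0, 0.35, 0.0]            # assumed [a, b, alpha, c]
    N_grid = np.logspace(6, 10, 12).reshape(-1, 1)  # model sizes, one per row
    y_clean = scaling_law_func(N_grid, true_params)
    y_noisy = y_clean + rng.normal(scale=0.01, size=y_clean.shape)
    fitted = fit_scaling_law(N_grid, y_noisy)
    print("assumed true params:", np.round(true_params, 4))
    print("fitted params:      ", np.round(fitted, 4))
    fit_mse = np.mean((scaling_law_func(N_grid, fitted) - y_noisy) ** 2)
    print("fit MSE:", fit_mse)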