# EVOLVE-BLOCK-START
"""
Simplified MoE scaling law with data-driven normalization and focused optimization
Hybrid power-law model: L = a * (E/E_ref)^b * (P/P_ref)^c + d * (E*P)^e + f
Uses 6 parameters with improved numerical stability and faster convergence
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
def scaling_law_func(data_points, params):
"""
Simplified hybrid scaling law with interaction term
L = a * (E/E_ref)^b * (P/P_ref)^c + d * (E*P)^e + f
6 parameters: [a, b, c, d, e, f]
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
if X.shape[0] == 0:
return np.array([])
params = np.asarray(params, dtype=np.float64)
num_experts = X[:, 0]
dense_params = X[:, 1]
    # Normalize by fixed reference scales (E_ref = 32 experts, P_ref = 4e8 dense parameters)
E_norm = num_experts / 32.0
P_norm = dense_params / 4e8
# Safety clipping
E_norm = np.clip(E_norm, 0.01, 100.0)
P_norm = np.clip(P_norm, 0.01, 100.0)
a, b, c, d, e, f = params[0], params[1], params[2], params[3], params[4], params[5]
    # Clip exponents to keep the power terms numerically stable
b = np.clip(b, -1.5, 1.5)
c = np.clip(c, -1.5, 1.5)
e = np.clip(e, -1.5, 1.5)
# Main power law term
term1 = a * np.power(E_norm, b) * np.power(P_norm, c)
    # Coupled interaction term in E_norm * P_norm
interaction = E_norm * P_norm
term2 = d * np.power(interaction, e)
loss = term1 + term2 + f
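    # Clamp predictions to a plausible loss range before returning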
return np.clip(loss, 0.5, 5.5)
def fit_scaling_law(data_points, loss_values):
"""
Focused multi-stage optimization with data-driven initialization
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64)
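    # Too few points for a meaningful fit; fall back to a generic default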
if len(y) < 3:
return np.array([1.0, -0.5, -0.5, 0.1, -0.5, 2.5])
y_mean = np.mean(y)
y_std = np.std(y)
y_min = np.min(y)
y_max = np.max(y)
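    # Mean-squared-error objective; non-finite results are penalized heavily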
def objective(params):
try:
pred = scaling_law_func(X, params)
mse = np.mean((pred - y) ** 2)
return mse if np.isfinite(mse) else 1e10
        except Exception:
            return 1e10
    # Parameter bounds; the bias bound is derived from the observed loss range
bounds = [
(0.5, 3.0), # a: amplitude
(-1.5, 1.0), # b: expert exponent
(-1.5, 1.0), # c: parameter exponent
(-0.5, 1.0), # d: interaction amplitude
(-1.5, 0.5), # e: interaction exponent
(y_min - y_std, y_max + y_std) # f: bias
]
# Focused initialization: 4 diverse candidates
inits = [
np.array([1.5, -0.2, -0.3, 0.1, -0.3, y_mean]), # Conservative
np.array([1.0, -0.3, -0.5, 0.05, -0.2, y_mean]), # Param-heavy
np.array([2.0, -0.1, -0.2, 0.2, -0.4, y_mean]), # Expert-heavy
np.array([1.3, -0.25, -0.35, 0.08, -0.25, y_mean]), # Balanced
]
best_params = None
best_loss = float('inf')
    # Stage 1: multi-start local optimization (L-BFGS-B) with tight tolerances
for init in inits:
try:
result = minimize(
objective,
init,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 1500, 'ftol': 1e-12, 'gtol': 1e-11}
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
            pass
    # Stage 2: global search (differential evolution) only if the local fits remain poor
if best_loss > 0.3:
try:
result = differential_evolution(
objective,
bounds,
maxiter=800,
popsize=30,
seed=42,
atol=1e-9,
tol=1e-9,
workers=1,
updating='deferred',
polish=True
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
            pass
# Fallback
if best_params is None:
best_params = np.array([1.3, -0.25, -0.35, 0.1, -0.3, y_mean])
return best_params
# EVOLVE-BLOCK-END
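

# A minimal usage sketch (illustrative only, not part of the evolved block):
# fit the law on a few synthetic (num_experts, dense_params) points with
# made-up loss values, then report the in-sample MSE. All numbers below are
# assumed placeholders chosen only to show the expected input shapes.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_points = np.array([
        [8, 1e8],
        [16, 2e8],
        [32, 4e8],
        [64, 8e8],
        [8, 4e8],
        [64, 2e8],
    ], dtype=np.float64)
    # Synthetic losses that decrease with experts and dense parameters, plus noise
    demo_losses = (
        2.0
        + 0.8 * (demo_points[:, 0] / 32.0) ** -0.2 * (demo_points[:, 1] / 4e8) ** -0.3
        + 0.01 * rng.standard_normal(len(demo_points))
    )
    fitted = fit_scaling_law(demo_points, demo_losses)
    preds = scaling_law_func(demo_points, fitted)
    print("fitted params:", np.round(fitted, 4))
    print("in-sample MSE:", float(np.mean((preds - demo_losses) ** 2)))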