import numpy as np
from scipy.optimize import minimize
# EVOLVE-BLOCK-START
"""
Refined MoE scaling law with a saturating expert‐term to capture diminishing returns.
Model form (6 params):
loss ≈ A1 * D_norm^α * E_norm^(−γ)
+ A2 * (1 − exp(−β * E_norm))
+ C
where
D_norm = dense_parameter_count / 1e8
E_norm = num_experts / 64
Parameters = [A1, α, γ, A2, β, C]
"""
def scaling_law_func(data_points, params):
    """
    Evaluate the scaling law at `data_points` (columns: num_experts,
    dense_parameter_count) for a single parameter vector of length 6
    or a (T, 6) matrix of parameter vectors.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    E = X[:, 0]  # num_experts
    D = X[:, 1]  # dense_parameter_count
    # fixed-scale normalization (no input-dependent stats)
    Dn = D / 1e8
    En = E / 64.0
    p = np.asarray(params, dtype=float)
    if p.ndim == 1:
        p = p[None, :]  # shape (1, 6)
    # unpack columns
    A1, alpha, gamma, A2, beta, C = p.T
    # broadcast to compute an (N, T) prediction matrix
    Dm = Dn[:, None]
    Em = En[:, None]
    term1 = A1[None, :] * (Dm ** alpha[None, :]) * (Em ** (-gamma[None, :]))
    term2 = A2[None, :] * (1.0 - np.exp(-beta[None, :] * Em))
    preds = term1 + term2 + C[None, :]
    # if a single parameter vector was passed, return shape (N,)
    return preds[:, 0] if preds.shape[1] == 1 else preds
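
# Minimal usage sketch (the expert counts, parameter counts, and coefficients
# below are made up for illustration only):
#
#   pts = np.array([[8, 1.3e8], [64, 3.5e8]])          # (num_experts, dense params)
#   params = np.array([1.2, 0.4, 0.3, 0.5, 1.0, 1.8])  # [A1, α, γ, A2, β, C]
#   scaling_law_func(pts, params)                       # -> shape (2,) predictions
#   scaling_law_func(pts, np.stack([params, params]))   # -> shape (2, 2), one column
#                                                       #    per parameter vector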
def fit_scaling_law(data_points, loss_values):
    """
    Fit [A1, α, γ, A2, β, C] by minimizing the mean absolute log-error
    via multi-start L-BFGS-B in a mixed raw/log parameter space.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    E = X[:, 0]
    D = X[:, 1]
    # same fixed normalization as scaling_law_func
    Dn = D / 1e8
    En = E / 64.0
    eps = 1e-12
    # unpack raw optimizer vector -> actual model parameters
    # (A1, gamma, A2, beta are optimized in log space to keep them positive)
    def unpack(raw):
        logA1, alpha, log_gamma, logA2, log_beta, C = raw
        return (np.exp(logA1),
                alpha,
                np.exp(log_gamma),
                np.exp(logA2),
                np.exp(log_beta),
                C)

    # objective: mean absolute log-error between predictions and targets
    def objective(raw):
        A1, alpha, gamma, A2, beta, C = unpack(raw)
        pred = (A1 * (Dn ** alpha) * (En ** (-gamma))
                + A2 * (1.0 - np.exp(-beta * En))
                + C)
        pred = np.maximum(pred, eps)
        return np.mean(np.abs(np.log(pred) - np.log(y + eps)))
    # bounds for raw parameters:
    #   logA1 ∈ [-5, 5], α ∈ [-2, 2], logγ ∈ [-5, 5],
    #   logA2 ∈ [-5, 5], logβ ∈ [-5, 5], C ∈ [0, 5]
    bounds = [
        (-5, 5),
        (-2, 2),
        (-5, 5),
        (-5, 5),
        (-5, 5),
        (0, 5),
    ]

    # prepare multi-start initial guesses; clip the offset init to its bounds
    # so starting points remain feasible even for large mean losses
    y0 = float(np.clip(np.mean(y), bounds[5][0], bounds[5][1]))
    inits = [
        np.array([ 0.0,  0.5, 0.0,  0.0,  1.0, y0]),
        np.array([-1.0,  1.0, 0.5, -1.0,  0.5, y0]),
        np.array([ 1.0, -0.5, 1.0,  1.0, -0.5, y0]),
    ]
    # add a couple of random perturbations (deterministic seed)
    rng = np.random.RandomState(0)
    for _ in range(2):
        raw = np.array([
            rng.uniform(-2, 2),   # logA1
            rng.uniform(-1, 1),   # α
            rng.uniform(-1, 1),   # logγ
            rng.uniform(-2, 2),   # logA2
            rng.uniform( 0, 2),   # logβ
            y0,
        ])
        inits.append(raw)
    best_fun = np.inf
    best_raw = inits[0]
    for raw0 in inits:
        res = minimize(
            objective,
            raw0,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 500, 'ftol': 1e-9},
        )
        if res.success and res.fun < best_fun:
            best_fun, best_raw = res.fun, res.x

    # recover optimal parameters from the best raw vector
    # (falls back to the first initial guess if no start converged)
    A1_opt, alpha_opt, gamma_opt, A2_opt, beta_opt, C_opt = unpack(best_raw)
    return np.array([A1_opt, alpha_opt, gamma_opt, A2_opt, beta_opt, C_opt])
# EVOLVE-BLOCK-END
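
# Minimal self-test sketch (outside the evolved block): fits the law to
# synthetic data generated from made-up ground-truth coefficients, then
# reports the fit error. Intended only to illustrate the API; the expert
# counts, parameter counts, and coefficients are assumptions for the demo.
if __name__ == "__main__":
    demo_rng = np.random.RandomState(1)
    experts = demo_rng.choice([1, 2, 4, 8, 16, 32, 64], size=40)
    dense = demo_rng.uniform(5e7, 5e8, size=40)
    pts = np.column_stack([experts, dense])
    true_params = np.array([1.0, -0.3, 0.2, 0.3, 1.0, 1.5])  # hypothetical
    losses = scaling_law_func(pts, true_params) + demo_rng.normal(0.0, 0.01, size=40)
    fitted = fit_scaling_law(pts, losses)
    print("fitted [A1, alpha, gamma, A2, beta, C]:", fitted)
    preds = np.maximum(scaling_law_func(pts, fitted), 1e-12)
    print("mean abs log-error:", np.mean(np.abs(np.log(preds) - np.log(losses))))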