← Back to Leaderboard

MoE Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.960856
Mean R²: 0.958019
Min R²: 0.953694
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.960856
Python
"""
6-parameter joint-saturating scaling law for MoE:
  loss ≈ C + (A·d^α + B) / (1 + e^γ + d^δ)
where
  d = dense_parameter_count / 1e8,
  e = num_experts / 64.
Each data point is a row [num_experts, dense_parameter_count].
Parameters (natural scale, not log scale): [A, α, B, γ, δ, C].
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the MoE scaling law for one or several parameter sets.

    The prediction is ``C + (A * d**alpha + B) / (1 + e**gamma + d**delta)``
    with ``d = dense_parameter_count / 1e8`` and ``e = num_experts / 64``.

    Args:
        data_points: array-like whose rows are
            ``[num_experts, dense_parameter_count]``; a single 1-D point is
            promoted to one row.
        params: array-like of 6 values ``[A, alpha, B, gamma, delta, C]`` or
            a 2-D array with one such parameter set per row.

    Returns:
        Array of shape ``(N,)`` when a single parameter set is given,
        otherwise shape ``(N, K)`` with one column per parameter set.

    Raises:
        ValueError: if the parameter array does not have 6 columns.
    """
    points = np.atleast_2d(np.asarray(data_points, dtype=float))
    experts = points[:, 0]
    dense = points[:, 1]

    # Column vectors (N, 1) of the normalised inputs.
    dense_col = (dense / 1e8)[:, np.newaxis]
    experts_col = (experts / 64.0)[:, np.newaxis]

    theta = np.atleast_2d(np.asarray(params, dtype=float))
    if theta.shape[1] != 6:
        raise ValueError("Expected 6 parameters [A, α, B, γ, δ, C]")

    # One row vector (1, K) per coefficient so that everything broadcasts
    # against the (N, 1) inputs into an (N, K) prediction grid.
    amp, alpha, offset, gamma, delta, floor = (
        row[np.newaxis, :] for row in theta.T
    )

    numerator = amp * (dense_col ** alpha) + offset
    denominator = 1.0 + experts_col ** gamma + dense_col ** delta
    prediction = floor + numerator / denominator

    # Collapse the parameter axis when only one parameter set was supplied.
    if prediction.shape[1] == 1:
        return prediction[:, 0]
    return prediction

def fit_scaling_law(data_points, loss_values):
    """Fit the 6-parameter MoE scaling law to observed losses.

    Minimises the mean squared error of
    ``C + (A * d**alpha + B) / (1 + e**gamma + d**delta)`` with
    ``d = dense_parameter_count / 1e8`` and ``e = num_experts / 64`` using
    bounded L-BFGS-B from several starting points.

    Args:
        data_points: array-like whose rows are
            ``[num_experts, dense_parameter_count]``; a single 1-D point is
            promoted to one row.
        loss_values: array-like of observed losses, one per data point
            (flattened before use).

    Returns:
        1-D array ``[A, alpha, B, gamma, delta, C]`` in natural (not log)
        scale. The result is never worse, in MSE, than the best of the
        starting points whose objective is finite.
    """
    # atleast_2d lets a lone [num_experts, dense_parameter_count] point
    # through instead of failing on the column indexing below.
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    d = X[:, 1] / 1e8
    e = X[:, 0] / 64.0

    # Objective in raw space [logA, alpha, logB, log(gamma), delta, C]:
    # A, B and gamma are optimised through their logs so they stay positive.
    def _mse(raw):
        logA, alpha, logB, logG, delta, C = raw
        A = np.exp(logA)
        B = np.exp(logB)
        gamma = np.exp(logG)
        pred = C + (A * (d ** alpha) + B) / (1.0 + e ** gamma + d ** delta)
        return np.mean((pred - y) ** 2)

    # Multiple starting points (raw space); C starts at the mean loss.
    C0 = np.mean(y)
    inits = [
        np.array([0.0, 1.0, 0.0, 1.0, 1.0, C0]),
        np.array([-1.0, 0.5, -1.0, 0.5, 0.5, C0]),
        np.array([1.0, 2.0, 1.0, 2.0, 2.0, C0]),
    ]

    bounds = [
        (-10, 10),     # logA
        (-5, 5),       # alpha
        (-10, 10),     # logB
        (-5, 5),       # log(gamma)
        (-5, 5),       # delta
        (None, None),  # C (unbounded)
    ]

    best_raw = None
    best_val = np.inf
    for init in inits:
        res = minimize(_mse, init, method='L-BFGS-B', bounds=bounds)
        # Rank by objective value rather than by ``res.success``: L-BFGS-B
        # can report failure (iteration cap, line-search trouble) while
        # ``res.x`` is still a far better fit than any starting point.
        # The start itself is a candidate so the result can never be worse
        # than the best finite initial guess.
        for raw, val in ((init, _mse(init)), (res.x, res.fun)):
            if np.isfinite(val) and val < best_val:
                best_val = float(val)
                best_raw = np.asarray(raw, dtype=float)

    # Last resort: every objective value was non-finite.
    if best_raw is None:
        best_raw = inits[0]

    logA, alpha, logB, logG, delta, C = best_raw
    return np.array([
        np.exp(logA),
        alpha,
        np.exp(logB),
        np.exp(logG),
        delta,
        C,
    ])
#2 Run 5 R² = 0.960419
#3 Run 1 R² = 0.958958
#4 Run 2 R² = 0.956167
#5 Run 3 R² = 0.953694