
MoE Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.961586
Mean R²: 0.957615
Min R²: 0.952085
Runs: 5
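
The R² values above are presumably the standard coefficient of determination between predicted and observed losses; a minimal sketch of that metric follows, assuming that convention (r_squared is an illustrative helper, not the benchmark's actual scoring code).

Python
import numpy as np

def r_squared(y_true, y_pred):
    # R^2 = 1 - SS_res / SS_tot: 1.0 is a perfect fit; 0.0 is no better
    # than always predicting the mean observed loss.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot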

All Runs (sorted by R²)

#1 (Best) Run 1 R² = 0.961586
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

_D0 = 1e8     # normalization scale for dense parameter counts (1e8 params)
_EPS = 1e-12  # numerical floor keeping logarithms finite

def _softplus(x):
    # Numerically stable softplus, log(1 + exp(x)): maps any real to a positive value.
    return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)

def _isp(y):
    # Inverse softplus: maps a positive target back to unconstrained raw space.
    y = np.maximum(y, 1e-12)
    return np.log(np.expm1(y))

def scaling_law_func(data_points, params):
    # data_points: (N,2) -> [num_experts, dense_parameter_count]
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    if X.shape[1] != 2:
        raise ValueError("Expected data_points with 2 features: [num_experts, dense_parameter_count].")
    E = X[:, 0].astype(np.float64)
    D = (X[:, 1].astype(np.float64) / _D0)

    p = np.asarray(params, dtype=np.float64)
    if p.ndim == 1:
        p = p[None, :]
    T, K = p.shape
    if K < 6:
        # Pad missing raw parameters with defaults matching the fit-time priors
        # (raw space: inverse-softplus of the intended positive values).
        pad = np.zeros((T, 6), dtype=np.float64)
        pad[:, :K] = p
        if K < 1: pad[:, 0] = 2.0
        if K < 2: pad[:, 1] = _isp(1.0)
        if K < 3: pad[:, 2] = _isp(0.5)
        if K < 4: pad[:, 3] = _isp(0.6)
        if K < 5: pad[:, 4] = _isp(8.0)
        if K < 6: pad[:, 5] = _isp(1.0)
        p = pad

    L_inf = p[:, 0]
    A     = _softplus(p[:, 1])
    a     = _softplus(p[:, 2])
    b     = _softplus(p[:, 3])
    E0    = _softplus(p[:, 4]) + 1e-8
    gamma = _softplus(p[:, 5])

    lnD = np.log(D + _EPS)[:, None]
    lnE = np.log(E + _EPS)[:, None]

    # P_eff = D^a + gamma * (E^b / (1 + E/E0)), computed stably in log-space
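    # Full model: L(E, D) = L_inf + A / P_eff. Effective capacity P_eff grows with
    # dense parameters (D^a) and with experts (gamma * E^b), but the expert term
    # saturates once E exceeds roughly E0. logaddexp adds the two terms while both
    # stay in log space, so neither can overflow.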
    t1 = a[None, :] * lnD
    t2 = np.log(gamma[None, :]) + b[None, :] * lnE - np.log1p(E[:, None] / E0[None, :])
    log_Peff = np.logaddexp(t1, t2)

    pred = L_inf[None, :] + A[None, :] * np.exp(-log_Peff)  # L = L_inf + A / P_eff
    return pred[:, 0] if pred.shape[1] == 1 else pred

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    if X.shape[1] != 2:
        raise ValueError("Expected data_points with 2 features: [num_experts, dense_parameter_count].")
    y2d = y[:, None] if y.ndim == 1 else y
    T = y2d.shape[1]

    E = X[:, 0].astype(np.float64)
    D = (X[:, 1].astype(np.float64) / _D0)
    lnD = np.log(D + _EPS)
    lnE = np.log(E + _EPS)

    def solve_LA(Peff, yt):
        # Ridge-stabilized least squares for [L_inf, A] in yt ≈ L_inf + A / Peff
        Xd = np.stack([np.ones_like(Peff), 1.0 / (Peff + 1e-12)], axis=1)
        lam = 1e-6
        XtX = Xd.T @ Xd + lam * np.eye(2)  # ridge term stabilizes the 2x2 solve
        sol = np.linalg.solve(XtX, Xd.T @ yt)
        return float(sol[0]), float(sol[1])

    def obj_core(q, yt, delta):
        a = _softplus(q[0])
        b = _softplus(q[1])
        E0 = _softplus(q[2]) + 1e-8
        g = _softplus(q[3])
        t1 = a * lnD
        t2 = np.log(g) + b * lnE - np.log1p(E / E0)
        log_Peff = np.logaddexp(t1, t2)
        Peff = np.exp(log_Peff)
        # Given Peff, [L_inf, A] has a closed-form (ridge) least-squares solution.
        L_opt, A_opt = solve_LA(Peff, yt)
        r = (L_opt + A_opt / (Peff + 1e-12)) - yt
        # Pseudo-Huber loss: quadratic for |r| << delta, linear beyond, so a few
        # poorly fit points cannot dominate the objective.
        loss = np.mean(delta**2 * (np.sqrt(1.0 + (r / delta)**2) - 1.0))
        # Weak priors keep the parameters physically plausible and anchor L_inf
        # slightly below the smallest observed loss.
        reg = 1e-4 * ((a - 0.5)**2 + (b - 0.6)**2 + (np.log(E0) - np.log(8.0))**2 + (g - 1.0)**2)
        reg += 5e-5 * (L_opt - (float(np.min(yt)) - 0.05))**2
        return loss + reg, L_opt, A_opt

    rng = np.random.default_rng(42)
    out = np.zeros((T, 6), dtype=np.float64)

    for t in range(T):
        yt = y2d[:, t]
        # Pseudo-Huber scale: proportional to the target spread, floored at 0.08.
        delta = max(0.08, 0.15 * float(np.std(yt)))
        base_q = np.array([_isp(0.5), _isp(0.6), _isp(8.0), _isp(1.0)], dtype=np.float64)

        def objective(q):
            v, _, _ = obj_core(q, yt, delta)
            return v

        # Score the base point up front so best_L/best_A are always defined,
        # even if no optimizer run improves on it.
        best_v, best_L, best_A = obj_core(base_q, yt, delta)
        best_q = base_q.copy()
        # Multi-start initialization: the base point, random perturbations of it,
        # and a coarse grid over (a, b, E0, gamma) in raw space.
        inits = [base_q.copy()]
        for _ in range(8):
            inits.append(base_q + rng.normal(0.0, 0.3, size=4))
        for E0_try in (4.0, 12.0, 16.0, 24.0):
            for g_try in (0.7, 1.0, 1.4):
                for a_try in (0.4, 0.5, 0.6):
                    for b_try in (0.5, 0.6, 0.7):
                        inits.append(np.array([_isp(a_try), _isp(b_try), _isp(E0_try), _isp(g_try)],
                                              dtype=np.float64))

        for init in inits:
            res = minimize(objective, init, method='L-BFGS-B', options={'maxiter': 800})
            # Fall back to the starting point if the optimizer reports failure.
            q_cand = res.x if res.success else init
            v, L_opt, A_opt = obj_core(q_cand, yt, delta)
            if v < best_v:
                best_v = v
                best_q = q_cand
                best_L = L_opt
                best_A = A_opt

        # Compose the full raw parameter vector expected by scaling_law_func:
        # [L_inf_raw, A_raw, qa, qb, qE0, qgamma]
        out[t] = np.array([best_L, _isp(best_A), best_q[0], best_q[1], best_q[2], best_q[3]], dtype=np.float64)

    return out[0] if T == 1 else out
# EVOLVE-BLOCK-END
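
A minimal usage sketch of the two entry points above; the (E, D) grid, ground-truth coefficients, and noise level are invented for illustration and are not benchmark data.

Python
import numpy as np

# Hypothetical grid of expert counts and dense parameter counts.
experts = np.array([1, 2, 4, 8, 16, 32], dtype=np.float64)
dense = np.array([5e7, 1e8, 3e8, 1e9, 3e9], dtype=np.float64)
Eg, Dg = np.meshgrid(experts, dense)
X = np.stack([Eg.ravel(), Dg.ravel()], axis=1)

# Synthetic losses drawn from the same functional form, plus small noise.
rng = np.random.default_rng(0)
Peff = (X[:, 1] / 1e8) ** 0.5 + X[:, 0] ** 0.6 / (1.0 + X[:, 0] / 8.0)
y = 2.0 + 1.0 / Peff + rng.normal(0.0, 0.005, size=Peff.shape)

params = fit_scaling_law(X, y)      # raw vector: [L_inf, A_raw, qa, qb, qE0, qgamma]
pred = scaling_law_func(X, params)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print("R^2 =", 1.0 - ss_res / ss_tot)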

#2 Run 3 R² = 0.960711
#3 Run 5 R² = 0.957837
#4 Run 2 R² = 0.955857
#5 Run 4 R² = 0.952085