
Vocabulary Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.988557
Mean R²: 0.986228
Min R²: 0.984955
Runs: 5

All Runs (sorted by R²)

#1 Run 4 R² = 0.988557 (Best)
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def _sp(x):
    # numerically stable softplus: log(1 + e^x) = max(x, 0) + log1p(e^-|x|)
    x = np.asarray(x, float)
    return np.where(x > 20.0, x, np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0))

def _sg(x):
    # logistic sigmoid, the derivative of the softplus above
    x = np.asarray(x, float)
    return 1.0 / (1.0 + np.exp(-x))

def scaling_law_func(data_points, params):
    # Loss_u = L + A * S * (1 + lam * t^2 / (1 + S))
    # S = exp(a*(log Pref - log P)) + exp(a*(log Cref - log C)), t = log V - v0
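    # Limiting behavior: as P and C grow, S -> 0 and the loss tends to the
    # irreducible floor L; the vocabulary penalty A * lam * t^2 * S / (1 + S)
    # is a parabola in log V with its optimum at log V = v0.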
    X = np.atleast_2d(np.asarray(data_points, float))
    P, V, C = X[:, 0], X[:, 1], X[:, 2]
    # work in log space; clipping at 1 keeps the logs finite and non-negative
    lP = np.log(np.clip(P, 1.0, np.inf))
    lV = np.log(np.clip(V, 1.0, np.inf))
    lC = np.log(np.clip(C, 1.0, np.inf))

    par = np.asarray(params, float)
    if par.ndim == 1:
        # single parameter vector: evaluate the law at every data point
        L, A, a, Pref, Cref, lam, v0 = par[:7]
        zP = np.log(np.clip(Pref, 1e-30, np.inf))
        zC = np.log(np.clip(Cref, 1e-30, np.inf))
        # clip the exponents so np.exp cannot overflow
        sP = np.clip(a * (zP - lP), -60.0, 60.0)
        sC = np.clip(a * (zC - lC), -60.0, 60.0)
        S = np.exp(sP) + np.exp(sC)
        Q = 1.0 + S
        t = lV - v0
        return L + A * S * (1.0 + lam * (t * t) / Q)
    else:
        # batch of parameter vectors: broadcast over data points and
        # return predictions with shape (n_points, n_param_sets)
        par = par[:, :7]
        L, A, a, Pref, Cref, lam, v0 = [par[:, i] for i in range(7)]
        zP = np.log(np.clip(Pref, 1e-30, np.inf))[:, None]
        zC = np.log(np.clip(Cref, 1e-30, np.inf))[:, None]
        sP = np.clip(a[:, None] * (zP - lP[None, :]), -60.0, 60.0)
        sC = np.clip(a[:, None] * (zC - lC[None, :]), -60.0, 60.0)
        S = np.exp(sP) + np.exp(sC)
        Q = 1.0 + S
        t = lV[None, :] - v0[:, None]
        return (L[:, None] + A[:, None] * S * (1.0 + lam[:, None] * (t * t) / Q)).T

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, float))
    y_in = np.asarray(loss_values, float)
    Y = y_in[:, None] if y_in.ndim == 1 else y_in
    lP = np.log(np.clip(X[:, 0], 1.0, np.inf))
    lV = np.log(np.clip(X[:, 1], 1.0, np.inf))
    lC = np.log(np.clip(X[:, 2], 1.0, np.inf))

    def sp_inv(x):
        # inverse softplus: sp_inv(_sp(x)) == x away from the overflow guard
        x = max(float(x), 1e-12)
        return np.log(np.expm1(x))

    def solve(yt):
        # robust scale from the median absolute deviation
        # (1.4826 makes the MAD consistent with sigma under Gaussian noise)
        med = float(np.median(yt))
        mad = 1.4826 * float(np.median(np.abs(yt - med)))
        scale = max(1e-2, mad if (mad > 0 and np.isfinite(mad)) else float(np.std(yt)) + 1e-2)

        # data-driven starting values for [L, A, a, zP, zC, lam, v0]
        L0 = float(np.min(yt) - 0.1)
        A0 = float(max(np.median(yt) - L0, 1.0))
        a0 = 0.5
        zP0, zC0 = float(np.median(lP)), float(np.median(lC))
        lam0 = 0.03
        v00 = float(np.median(lV))

        def raw_to_params(raw):
            # softplus reparameterization keeps A, a, lam positive;
            # the small offsets bound them away from zero
            L = raw[0]
            A = _sp(raw[1]) + 1e-8
            a = _sp(raw[2]) + 0.05
            zP = raw[3]
            zC = raw[4]
            lam = _sp(raw[5]) + 1e-10
            v0 = raw[6]
            return L, A, a, zP, zC, lam, v0

        def loss_grad(raw):
            L, A, a, zP, zC, lam, v0 = raw_to_params(raw)
            sP = a * (zP - lP); sC = a * (zC - lC)
            sPc = np.clip(sP, -60.0, 60.0); sCc = np.clip(sC, -60.0, 60.0)
            # masks zero the gradient wherever the exponent was clipped
            mP = (sP == sPc); mC = (sC == sCc)
            uP = np.exp(sPc); uC = np.exp(sCc)
            S = uP + uC; Q = 1.0 + S
            t = lV - v0; T2 = t * t
            pred = L + A * S * (1.0 + lam * T2 / Q)

            # partial derivatives of pred with respect to the natural parameters
            dL = np.ones_like(pred)
            dA = S * (1.0 + lam * T2 / Q)
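            # d pred / dS = A * (1 + lam*T2*(Q - S)/Q**2) = A * (1 + lam*T2/Q**2), since Q - S = 1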
            dSd = A * (1.0 + lam * T2 / (Q * Q))
            dSP = dSd * uP * mP
            dSC = dSd * uC * mC
            dAlpha = dSP * (zP - lP) + dSC * (zC - lC)
            dzP = dSP * a
            dzC = dSC * a
            dLam = A * S * T2 / Q
            dv0 = -2.0 * A * S * lam * t / Q

            # chain rule through the reparameterization: d softplus(x)/dx = sigmoid(x)
            J = np.empty((7, pred.size), float)
            J[0, :] = dL
            J[1, :] = dA * _sg(raw[1])
            J[2, :] = dAlpha * _sg(raw[2])
            J[3, :] = dzP
            J[4, :] = dzC
            J[5, :] = dLam * _sg(raw[5])
            J[6, :] = dv0

            r = pred - yt
            z = r / scale
            # robust log-cosh loss: quadratic near zero, linear in the tails;
            # log(cosh(z)) = |z| + log1p(exp(-2|z|)) - log(2) avoids overflow
            az = np.abs(z)
            loss = (scale ** 2) * float(np.mean(az + np.log1p(np.exp(-2.0 * az)) - np.log(2.0)))
            # d loss / d pred = (scale / n) * tanh(z)
            w = (scale / pred.size) * np.tanh(z)

            # mild regularization to stabilize fit
            reg = 1e-7 * (raw[1] ** 2 + raw[2] ** 2 + raw[5] ** 2) \
                + 1e-7 * (raw[6] - v00) ** 2 \
                + 1e-8 * ((raw[3] - zP0) ** 2 + (raw[4] - zC0) ** 2)
            grad_reg = np.array([0.0, 2e-7 * raw[1], 2e-7 * raw[2], 2e-8 * (raw[3] - zP0),
                                 2e-8 * (raw[4] - zC0), 2e-7 * raw[5], 2e-7 * (raw[6] - v00)])

            grad = J @ w + grad_reg
            return loss + reg, grad

        raw0 = np.array([L0, sp_inv(A0), sp_inv(a0 - 0.05), zP0, zC0, sp_inv(lam0), v00], float)
        # small deterministic perturbations of raw0 give a cheap multi-start search
        inits = [
            raw0,
            raw0 + np.array([0.0, 0.4, 0.2, 0.5, -0.5, -0.4, 0.0]),
            raw0 + np.array([0.0, -0.4, -0.2, -0.5, 0.5, 0.4, 0.0]),
            raw0 + np.array([0.0, 0.2, 0.3, 0.2, 0.2, 0.2, 0.0])
        ]

        best_raw, best_val = raw0, np.inf
        for r0 in inits:
            # jac=True lets L-BFGS-B get loss and gradient from a single call
            res = minimize(loss_grad, r0, jac=True, method='L-BFGS-B',
                           options={'maxiter': 600, 'ftol': 1e-9})
            cand = res.x if res.success else r0
            val = loss_grad(cand)[0]
            if val < best_val:
                best_val, best_raw = val, cand

        L, A, a, zP, zC, lam, v0 = raw_to_params(best_raw)
        # return Pref and Cref on the linear scale expected by scaling_law_func
        return np.array([L, A, a, np.exp(zP), np.exp(zC), lam, v0], float)

    T = Y.shape[1]
    if T == 1:
        return solve(Y[:, 0])
    # multiple target columns: fit each one independently
    out = np.zeros((T, 7), float)
    for t in range(T):
        out[t, :] = solve(Y[:, t])
    return out
# EVOLVE-BLOCK-END
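
A minimal smoke test of the two functions above, under stated assumptions: the (P, V, C) ranges, the "true" parameter vector, and the noise level are illustrative inventions, not values from this run, and the three columns are interpreted here as parameter count, vocabulary size, and compute.

Python
import numpy as np

# Hypothetical check (not part of the submission): generate synthetic
# (P, V, C) points, create losses from an assumed parameter vector,
# then fit the law and score it with R².
rng = np.random.default_rng(0)
P = 10.0 ** rng.uniform(7, 10, 200)    # parameter counts (assumed range)
V = 10.0 ** rng.uniform(3, 5.5, 200)   # vocabulary sizes (assumed range)
C = 10.0 ** rng.uniform(17, 21, 200)   # compute budgets (assumed range)
X = np.column_stack([P, V, C])

true = np.array([1.8, 2.0, 0.4, 3e8, 5e19, 0.05, 10.0])  # [L, A, a, Pref, Cref, lam, v0], illustrative
y = scaling_law_func(X, true) + rng.normal(0.0, 0.01, len(X))

params = fit_scaling_law(X, y)
pred = scaling_law_func(X, params)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(f"recovered params: {params}\nR^2 = {r2:.4f}")

Nothing here depends on the synthetic choices; any grid of positive (P, V, C) values exercises the same code paths.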
#2 Run 2 R² = 0.986609
#3 Run 5 R² = 0.985549
#4 Run 1 R² = 0.985472
#5 Run 3 R² = 0.984955