Parallel Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.999985
Mean R²: 0.999955
Min R²: 0.999927
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 3 R² = 0.999985
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    # loss(N,k) = L0 + A*(N/1e9)^(-alpha) + G*(k^(-1/2) - 1)
    # 4 params: [L0, logA, logalpha, logG]; A, alpha, G are kept positive via exp()
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    th = np.asarray(params, dtype=float)
    if th.ndim == 1:
        th = th[None, :]
    th = th[:, :4]

    N = np.maximum(X[:, 0] / 1e9, 1e-12)
    k = np.maximum(X[:, 1], 1.0)
    logN = np.log(N)[:, None]
    termK = (k[:, None] ** -0.5) - 1.0
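    # termK is 0 at k = 1 and tends to -1 as k -> inf, so G caps the loss reduction from parallelism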

    L0    = th[:, 0][None, :]
    A     = np.exp(th[:, 1])[None, :]
    alpha = np.exp(th[:, 2])[None, :]
    G     = np.exp(th[:, 3])[None, :]

    pred = L0 + A * np.exp(-alpha * logN) + G * termK
    return pred[:, 0] if pred.shape[1] == 1 else pred

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float)

    N = np.maximum(X[:, 0] / 1e9, 1e-12)
    k = np.maximum(X[:, 1], 1.0)
    logN = np.log(N)
    termK = k ** -0.5 - 1.0

    # Robust Huber parameters and tiny L2 for numerical stability
    delta = max(0.02, 0.05 * float(np.std(y)))
    l2 = 1e-8
    n = y.size

    # Grid-search initialization over alpha, LS for (L0, A, G)
    best = None
    for a0 in (0.15, 0.2, 0.25, 0.3, 0.35):
        D = np.column_stack([np.ones_like(N), N ** -a0, termK])
        reg = 1e-12
        w = np.linalg.lstsq(D.T @ D + reg * np.eye(3), D.T @ y, rcond=None)[0]
        L0_0, A0, G0 = float(w[0]), float(abs(w[1]) + 1e-9), float(abs(w[2]) + 1e-9)
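        # Score this alpha candidate by plain MSE using the positivity-projected coefficients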
        mse = np.mean((L0_0 + A0 * N ** -a0 + G0 * termK - y) ** 2)
        if best is None or mse < best[0]:
            best = (mse, np.array([L0_0, np.log(A0), np.log(a0), np.log(G0)], dtype=float))
    theta0 = best[1]

    def obj_grad(theta):
        L0r, logA, logalpha, logG = theta
        A = np.exp(logA)
        a = np.exp(logalpha)
        G = np.exp(logG)
        termN = np.exp(-a * logN)
        pred = L0r + A * termN + G * termK
        r = pred - y
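        # Pseudo-Huber objective: ~r^2/2 for |r| << delta, ~delta*|r| for outliers (robust to noisy points)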
        s = np.sqrt(1.0 + (r / delta) ** 2)
        obj = (delta * delta * (s - 1.0)).mean() + l2 * np.dot(theta, theta)

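        # Analytic gradient: d(mean loss)/dr_i = (r_i / s_i) / n; chain rule through exp() handles the log-parameters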
        d = (r / s) / n
        gL = d.sum()
        g_logA = np.sum(d * A * termN)
        g_logalpha = np.sum(d * A * a * (-logN) * termN)
        g_logG = np.sum(d * G * termK)
        grad = np.array([gL, g_logA, g_logalpha, g_logG]) + 2 * l2 * theta
        return obj, grad

    # obj_grad returns (objective, gradient), so pass jac=True rather than evaluating it twice per step
    res = minimize(obj_grad, theta0, method='L-BFGS-B', jac=True,
                   options=dict(maxiter=300, ftol=1e-12, gtol=1e-8))
    return res.x if res.success else theta0
# EVOLVE-BLOCK-END
#2 Run 4 R² = 0.999957
#3 Run 1 R² = 0.999954
#4 Run 2 R² = 0.999954
#5 Run 5 R² = 0.999927
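
Usage sketch (illustrative, not part of any submitted run): the snippet below assumes the best run's scaling_law_func and fit_scaling_law are in scope, generates synthetic (N, k) data from the same functional form, fits it, and reports R². The parameter values, sample size, and noise level are arbitrary assumptions, not benchmark data.

Python
# Illustrative only: synthetic data with assumed parameters
import numpy as np

rng = np.random.default_rng(0)
n_pts = 200
N = rng.uniform(1e8, 7e10, size=n_pts)            # model sizes in parameters
k = rng.integers(1, 9, size=n_pts).astype(float)  # number of parallel streams
X = np.column_stack([N, k])

# Ground-truth [L0, logA, logalpha, logG], chosen arbitrarily for the demo
theta_true = np.array([2.0, np.log(1.5), np.log(0.3), np.log(0.4)])
y = scaling_law_func(X, theta_true) + rng.normal(0.0, 0.01, size=n_pts)

theta_hat = fit_scaling_law(X, y)
pred = scaling_law_func(X, theta_hat)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(f"R^2 = {r2:.6f}")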