Parallel Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.999958
Mean R²: 0.999955
Min R²: 0.999954
Runs: 5

All Runs (sorted by R²)

#1 Run 2 R² = 0.999958 (best)
Python
import numpy as np
from scipy.optimize import least_squares

def scaling_law_func(data_points, params):
    """
    Four-parameter scaling law with diminishing returns in parallelism:
      loss = b + a * (N/1e9)^(-alpha) * (1 + log2(P))^(-beta)
    where:
      N      = model size (num_params)
      P      = parallel_size
      P_eff  = 1 + log2(P) for diminishing returns
    params = [a, alpha, beta, b]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    N = X[:, 0] / 1e9
    P = X[:, 1]
    # effective parallel factor (assumes P >= 1, so P_eff >= 1 and the power stays real)
    eps = 1e-12
    P_eff = 1.0 + np.log2(np.clip(P, eps, None))
    a, alpha, beta, b = params
    # enforce minimal positivity for stability
    a     = max(a, eps)
    alpha = max(alpha, eps)
    beta  = max(beta, eps)
    b     = max(b, 0.0)
    return b + a * (N ** (-alpha)) * (P_eff ** (-beta))


def fit_scaling_law(data_points, loss_values):
    """
    Fit the 4-parameter law
      loss = b + a*(N/1e9)^(-alpha)*(1+log2(P))^(-beta)
    via:
      1) init b near the lowest observed loss
      2) log-linear regression for [a, alpha, beta]
      3) robust non-linear least squares (Huber) with analytic Jacobian
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    if X.shape[0] != y.size:
        raise ValueError("data_points and loss_values must have matching lengths")

    # preprocess features
    N = X[:, 0] / 1e9
    P = X[:, 1]
    eps = 1e-12
    P_eff = 1.0 + np.log2(np.clip(P, eps, None))
    y_min = float(np.min(y))

    # 1) initialize intercept b near the lower envelope;
    #    using 0.9 * y_min keeps y - b0 strictly positive for the log step below
    b0 = max(0.0, 0.9 * y_min)

    # 2) log-linear initialization for a, alpha, beta
    y_shift = y - b0
    # clamp to positive
    y_shift = np.clip(y_shift, eps, None)
    logy   = np.log(y_shift)
    logN   = np.log(np.clip(N,   eps, None))
    logPe  = np.log(P_eff)

    # design matrix: logy ≈ C0 − alpha*logN − beta*logPe
    A = np.column_stack([np.ones_like(logy), -logN, -logPe])
    try:
        C0, alpha0, beta0 = np.linalg.lstsq(A, logy, rcond=None)[0]
        a0     = max(np.exp(C0),     eps)
        alpha0 = max(alpha0,         eps)
        beta0  = max(beta0,          eps)
    except Exception:
        span   = max(np.max(y) - y_min, eps)
        a0, alpha0, beta0 = span, 0.5, 0.2

    init_params = np.array([a0, alpha0, beta0, b0], dtype=float)

    # residuals for least_squares (raw domain)
    def residuals(p):
        return scaling_law_func(X, p) - y

    # analytic Jacobian ∂r/∂p
    def jac(p):
        a, alpha, beta, _ = p
        M = (N ** (-alpha)) * (P_eff ** (-beta))
        da     = M
        dalpha = -a * M * np.log(np.clip(N, eps, None))
        dbeta  = -a * M * np.log(P_eff)
        db     = np.ones_like(M)
        return np.vstack([da, dalpha, dbeta, db]).T

    # bounds: a,alpha,beta ≥ eps; b in [0, y_min]
    # (assumes y_min > 0, so the box for b is non-degenerate)
    lower = [eps, eps, eps, 0.0]
    upper = [np.inf, np.inf, np.inf, y_min]

    # 3) robust fitting with Huber loss
    result = least_squares(
        residuals,
        init_params,
        jac=jac,
        bounds=(lower, upper),
        loss='huber',
        f_scale=1e-3,
        xtol=1e-12,
        ftol=1e-12,
        gtol=1e-12
    )

    # fall back to the log-linear initialization if the solver fails
    p_opt = result.x if result.success else init_params
    # ensure intercept never exceeds the observed minimum
    p_opt[3] = min(p_opt[3], y_min)
    return p_opt
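
Usage sketch. Both functions above expect data_points as a 2-column array of (num_params, parallel_size) rows. As a minimal example (the grid, seed, and "true" parameters below are invented for illustration, not the benchmark's data), this fits synthetic losses and computes a standard coefficient of determination for the fit:

Python
import numpy as np

# hypothetical grid: raw parameter counts N and parallel sizes P
N = np.array([1e8, 1e8, 1e9, 1e9, 1e10, 1e10])
P = np.array([1.0, 8.0, 1.0, 8.0, 1.0, 8.0])
X = np.column_stack([N, P])

# synthetic losses from the same functional form, plus small noise
true_params = [2.0, 0.3, 0.15, 1.5]  # [a, alpha, beta, b], made up
rng = np.random.default_rng(0)
y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=X.shape[0])

p_fit = fit_scaling_law(X, y)
y_hat = scaling_law_func(X, p_fit)

# R² = 1 - SS_res / SS_tot
ss_res = float(np.sum((y - y_hat) ** 2))
ss_tot = float(np.sum((y - y.mean()) ** 2))
print("fitted params:", p_fit, "R²:", 1.0 - ss_res / ss_tot)
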
#2 Run 1 R² = 0.999954
#3 Run 3 R² = 0.999954
#4 Run 4 R² = 0.999954
#5 Run 5 R² = 0.999954