← Back to Leaderboard

LR-BSZ Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.906301
Mean R²: 0.887927
Min R²: 0.865452
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.906301
Python
import numpy as np

# normalization constants for numerical stability
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8
_eps = 1e-12

# EVOLVE-BLOCK-START
def _design_matrix(data_points):
    """
    Build the log-space design matrix shared by prediction and fitting.

    Keeping this in one place guarantees that `fit_scaling_law` and
    `scaling_law_func` always agree on feature order and transforms.

    Args:
        data_points: array-like with one row per point and columns in the
            order (lr, bsz, D, N). A single 1-D point of 4 values is also
            accepted and treated as one row.

    Returns:
        ndarray of shape (n_points, 9) with columns
        [1, lnN, lnD, lnB, lnL, lnL^2, lnL*lnN, lnL*lnD, lnL*lnB],
        where each ln* is the log of the input divided by its
        normalization constant.

    Raises:
        ValueError: if `data_points` does not have exactly 4 columns.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    if X.ndim != 2 or X.shape[1] != 4:
        raise ValueError(
            "Expected data_points with 4 columns (lr, bsz, D, N), "
            f"got shape {X.shape}"
        )
    lr, bsz, D, N = X.T

    # _eps is added after normalization so a zero input maps to log(_eps)
    # rather than -inf.
    lnN = np.log(N / _N0 + _eps)
    lnD = np.log(D / _D0 + _eps)
    lnB = np.log(bsz / _bsz0 + _eps)
    lnL = np.log(lr / _lr0 + _eps)

    # intercept + 4 main effects + lr quadratic + 3 lr interactions
    return np.vstack([
        np.ones_like(lnN),
        lnN,
        lnD,
        lnB,
        lnL,
        lnL * lnL,
        lnL * lnN,
        lnL * lnD,
        lnL * lnB,
    ]).T


def scaling_law_func(data_points, params):
    """
    Evaluate the enhanced log-linear scaling law.

    The model has:
      - a pure batch-size exponent term
      - an lr quadratic penalty term (to capture optimal lr)
      - interactions between lr and N, D, bsz

    log y = p0
          + p1*log(N/N0)
          + p2*log(D/D0)
          + p3*log(bsz/bsz0)
          + p4*log(lr/lr0)
          + p5*(log(lr/lr0))^2
          + p6*[log(lr/lr0)*log(N/N0)]
          + p7*[log(lr/lr0)*log(D/D0)]
          + p8*[log(lr/lr0)*log(bsz/bsz0)]
    y = exp(log y)

    Args:
        data_points: array-like of rows (lr, bsz, D, N).
        params: array-like holding the 9 coefficients p0..p8; it is
            flattened before use.

    Returns:
        1-D ndarray with one prediction per input row.

    Raises:
        ValueError: if `data_points` does not have 4 columns or `params`
            does not contain exactly 9 values.
    """
    F = _design_matrix(data_points)
    p = np.asarray(params, dtype=float).ravel()
    # Explicit raise instead of `assert`, which is removed under `python -O`.
    if p.size != F.shape[1]:
        raise ValueError(f"Expected {F.shape[1]} params, got {p.size}")
    return np.exp(F.dot(p))


def fit_scaling_law(data_points, loss_values, *, reg=1e-6):
    """
    Fit the 9 parameters via ridge-regularized least squares in log-space:
      minimize ||F·p − log(y)||^2 + reg·||p||^2

    The penalty applies to every coefficient, including the intercept.

    Args:
        data_points: array-like of rows (lr, bsz, D, N).
        loss_values: array-like of target values, one per row; flattened
            before use. Values below 1e-12 are clipped to 1e-12 before
            taking the log.
        reg: non-negative ridge strength added to the diagonal of the
            normal equations (keyword-only, default 1e-6).

    Returns:
        1-D ndarray of the 9 fitted coefficients p0..p8, in the order
        expected by `scaling_law_func`.

    Raises:
        ValueError: if `data_points` does not have 4 columns, if the number
            of loss values differs from the number of rows, or if `reg`
            is negative.
        numpy.linalg.LinAlgError: if the regularized normal matrix is
            singular (possible when `reg` is 0).
    """
    if reg < 0:
        raise ValueError(f"reg must be non-negative, got {reg}")

    F = _design_matrix(data_points)
    y = np.asarray(loss_values, dtype=float).ravel()
    if y.size != F.shape[0]:
        raise ValueError(
            f"Expected {F.shape[0]} loss values, got {y.size}"
        )

    # Clip before the log so non-positive targets do not produce -inf/NaN.
    y_log = np.log(np.maximum(y, _eps))

    # Ridge-regularized normal equations: (FᵀF + reg·I) p = Fᵀ log(y)
    A = F.T.dot(F) + reg * np.eye(F.shape[1])
    b = F.T.dot(y_log)
    return np.linalg.solve(A, b)
# EVOLVE-BLOCK-END
# EVOLVE-BLOCK-END
#2 Run 1 R² = 0.901014
#3 Run 5 R² = 0.900532
#4 Run 3 R² = 0.866336
#5 Run 2 R² = 0.865452