SLD - LR-BSZ Scaling Law - SLDAgent + o4-mini

All Runs (sorted by R²)

Best Run 4 R² = 0.906301

▼

Python

import numpy as np

# normalization constants for numerical stability: the lr, bsz, D and N
# columns are divided by these reference values before taking logs
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8
# small offset added to every normalized ratio (and used as a floor on the
# observed losses) so that log() is not evaluated at exactly zero
_eps = 1e-12

# EVOLVE-BLOCK-START
def scaling_law_func(data_points, params):
    """
    Evaluate the 9-coefficient log-linear scaling law.

    The model is linear in log-space and exponentiated at the end:
      log y = c0
            + c1*ln_N + c2*ln_D + c3*ln_B + c4*ln_L
            + c5*ln_L^2
            + c6*ln_L*ln_N + c7*ln_L*ln_D + c8*ln_L*ln_B
    where ln_N = log(N/_N0 + _eps), ln_D = log(D/_D0 + _eps),
    ln_B = log(bsz/_bsz0 + _eps) and ln_L = log(lr/_lr0 + _eps).

    data_points: array-like whose rows are (lr, bsz, D, N); a single
                 row is promoted to a one-row table.
    params:      array-like holding the 9 coefficients c0..c8
                 (flattened before use).
    Returns a 1-D array with one prediction per row of data_points.
    """
    table = np.atleast_2d(np.asarray(data_points, dtype=float))
    lr, bsz, D, N = table.T

    # normalized log features
    ln_N = np.log(N / _N0 + _eps)
    ln_D = np.log(D / _D0 + _eps)
    ln_B = np.log(bsz / _bsz0 + _eps)
    ln_L = np.log(lr / _lr0 + _eps)

    # one entry per coefficient, in the order the coefficients are stored
    feature_rows = [
        np.ones_like(ln_N),
        ln_N,
        ln_D,
        ln_B,
        ln_L,
        ln_L * ln_L,
        ln_L * ln_N,
        ln_L * ln_D,
        ln_L * ln_B,
    ]
    design = np.vstack(feature_rows).T

    coeffs = np.asarray(params, dtype=float).ravel()
    assert coeffs.size == design.shape[1], f"Expected {design.shape[1]} params, got {coeffs.size}"
    return np.exp(design.dot(coeffs))


def fit_scaling_law(data_points, loss_values):
    """
    Estimate the 9 coefficients of the log-linear law in closed form.

    Solves the ridge-regularized normal equations
      (F^T F + lam*I) c = F^T log(y),   lam = 1e-6
    which minimize ||F c - log(y)||^2 + lam*||c||^2. The columns of F are
      [1, ln_N, ln_D, ln_B, ln_L, ln_L^2, ln_L*ln_N, ln_L*ln_D, ln_L*ln_B]
    with ln_v = log(v/v0 + _eps) for each normalized input.

    data_points: array-like whose rows are (lr, bsz, D, N).
    loss_values: array-like of observed losses (floored at _eps before log).
    Returns the 1-D coefficient vector of length 9.
    """
    table = np.atleast_2d(np.asarray(data_points, dtype=float))
    losses = np.asarray(loss_values, dtype=float).ravel()
    # floor the targets so the log is defined for non-positive entries
    log_losses = np.log(np.maximum(losses, _eps))

    lr, bsz, D, N = table.T
    ln_N = np.log(N / _N0 + _eps)
    ln_D = np.log(D / _D0 + _eps)
    ln_B = np.log(bsz / _bsz0 + _eps)
    ln_L = np.log(lr / _lr0 + _eps)

    feature_rows = [
        np.ones_like(ln_N),
        ln_N,
        ln_D,
        ln_B,
        ln_L,
        ln_L * ln_L,
        ln_L * ln_N,
        ln_L * ln_D,
        ln_L * ln_B,
    ]
    design = np.vstack(feature_rows).T

    # small ridge term keeps the Gram matrix well-posed
    ridge = 1e-6
    n_coef = design.shape[1]
    gram = design.T.dot(design) + ridge * np.eye(n_coef)
    rhs = design.T.dot(log_losses)
    return np.linalg.solve(gram, rhs)
# EVOLVE-BLOCK-END

#2 Run 1 R² = 0.901014

▼

Python

import numpy as np
from scipy.optimize import minimize

# normalization constants (approximate geometric means); the lr, bsz, D and N
# columns are divided by these before being raised to fitted exponents
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8
# small positive offset/floor: added to the normalized inputs and used as a
# lower bound on losses and predictions before taking logs
_eps = 1e-12

def scaling_law_func(data_points, params):
    """
    Evaluate the additive 7-parameter loss model:

      L = Linf
        + A * x_N^(-alpha) * x_D^(-beta)
        + B * x_lr^(gamma) * x_bsz^(-delta)

    where x_v = v/v0 + _eps for each input column v in (lr, bsz, D, N).

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    params:      [log Linf, log A, log alpha, log beta, log B, gamma, delta].
                 The first five entries are exponentiated, so Linf, A, alpha,
                 beta and B are always positive. If a 2-D array is passed,
                 only its first row is used.
    Returns a 1-D array with one predicted loss per data row.
    """
    pts = np.atleast_2d(np.asarray(data_points, float))
    col_lr, col_bsz, col_D, col_N = pts[:, 0], pts[:, 1], pts[:, 2], pts[:, 3]

    # shift the normalized ratios away from zero before exponentiation
    x_lr = col_lr / _lr0 + _eps
    x_bsz = col_bsz / _bsz0 + _eps
    x_D = col_D / _D0 + _eps
    x_N = col_N / _N0 + _eps

    theta = np.asarray(params, float)
    if theta.ndim == 1:
        theta = theta[None, :]
    assert theta.shape[1] == 7, f"Expected 7 params, got {theta.shape[1]}"

    log_floor, log_amp_size, log_alpha, log_beta, log_amp_hyper, gamma, delta = theta[0]
    floor = np.exp(log_floor)
    amp_size = np.exp(log_amp_size)
    alpha = np.exp(log_alpha)
    beta = np.exp(log_beta)
    amp_hyper = np.exp(log_amp_hyper)

    size_part = amp_size * x_N ** (-alpha) * x_D ** (-beta)
    hyper_part = amp_hyper * x_lr ** gamma * x_bsz ** (-delta)
    return floor + size_part + hyper_part

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 7-parameter scaling law by minimizing log-space MSE:
      obj = mean( (log y_pred - log y_true)^2 )

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    loss_values: array-like of observed losses (floored at _eps before log).
    Returns the parameter vector
      [logLinf, logA, log_alpha, log_beta, logB, gamma, delta].

    The optimizer's final iterate is returned when L-BFGS-B reports success.
    When it stops without converging (e.g. iteration limit), that iterate is
    still returned as long as it is finite and scores no worse than the
    initial guess; otherwise the bounds-clipped initial guess is returned.
    """
    X = np.atleast_2d(np.asarray(data_points, float))
    y = np.asarray(loss_values, float).ravel()
    y = np.maximum(y, _eps)
    # constant across objective evaluations, so compute it once
    log_y = np.log(y)

    # initialize params
    y_min, y_ptp = np.min(y), np.ptp(y)
    p0 = np.array([
        np.log(y_min * 0.9),
        np.log(y_ptp * 0.5 + _eps),
        np.log(1.0), np.log(1.0),
        np.log(y_ptp * 0.5 + _eps),
        0.0, 0.0
    ])

    # bounds to keep exponents/coefs reasonable
    bounds = [
        (-10, 10),   # logLinf
        (-15, 15),   # logA
        (-5, 5),     # logα
        (-5, 5),     # logβ
        (-15, 15),   # logB
        (-5, 5),     # γ
        (-5, 5)      # δ
    ]

    # The data-driven start can fall outside the box (e.g. zero loss spread
    # gives log(_eps) for logA/logB); clip it so the fallback is feasible too.
    lower = np.array([lo for lo, _ in bounds], dtype=float)
    upper = np.array([hi for _, hi in bounds], dtype=float)
    p0 = np.clip(p0, lower, upper)

    def objective(p):
        y_pred = np.maximum(scaling_law_func(X, p), _eps)
        return np.mean((np.log(y_pred) - log_y) ** 2)

    res = minimize(objective, p0,
                   method='L-BFGS-B',
                   bounds=bounds,
                   options={'ftol': 1e-9, 'maxiter': 1000})

    p_fit = np.asarray(res.x, dtype=float)
    if p_fit.shape != p0.shape or not np.all(np.isfinite(p_fit)):
        return p0
    if res.success:
        return p_fit
    # success=False also covers iteration/evaluation limits and abnormal
    # line-search exits, where res.x usually still improves on the start.
    f_fit, f_start = objective(p_fit), objective(p0)
    if np.isfinite(f_fit) and (not np.isfinite(f_start) or f_fit <= f_start):
        return p_fit
    return p0

#3 Run 5 R² = 0.900532

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# normalization constants: data columns 0..3 (lr, bsz, D, N) are divided by
# these reference values before being raised to fitted exponents
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8

def scaling_law_func(data_points, params):
    """
    Evaluate the 8-parameter scaling law with a multiplicative
    hyperparameter coupling:

      L = Linf + A * n^(-alpha) * d^(-beta) * [1 + B * l^gamma * b^(-delta)]^rho

    where l = lr/_lr0, b = bsz/_bsz0, d = D/_D0 and n = N/_N0.

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    params:      one parameter set of length 8, or a (T, 8) array of T sets,
                 laid out as
                   [Linf, log A, log alpha, log beta, log B, gamma, delta, log rho]
                 (the log-entries are exponentiated, so A, alpha, beta, B and
                 rho are positive).
    Returns an array of shape (N,) for a single parameter set, otherwise (N, T).
    """
    pts = np.atleast_2d(np.asarray(data_points, dtype=float))

    # normalized inputs
    l_ratio = pts[:, 0] / _lr0
    b_ratio = pts[:, 1] / _bsz0
    d_ratio = pts[:, 2] / _D0
    n_ratio = pts[:, 3] / _N0

    theta = np.asarray(params, dtype=float)
    if theta.ndim == 1:
        theta = theta[None, :]
    assert theta.shape[1] == 8, f"Expected 8 params, got {theta.shape[1]}"
    n_sets = theta.shape[0]

    # per-set parameters as row vectors (1, T)
    floor = theta[:, 0][None, :]
    amp = np.exp(theta[:, 1])[None, :]
    alpha = np.exp(theta[:, 2])[None, :]
    beta = np.exp(theta[:, 3])[None, :]
    coupling_amp = np.exp(theta[:, 4])[None, :]
    gamma = theta[:, 5][None, :]
    delta = theta[:, 6][None, :]
    rho = np.exp(theta[:, 7])[None, :]

    # data as column vectors (N, 1) so everything broadcasts to (N, T)
    l_col = l_ratio[:, None]
    b_col = b_ratio[:, None]
    d_col = d_ratio[:, None]
    n_col = n_ratio[:, None]

    size_part = amp * (n_col ** (-alpha)) * (d_col ** (-beta))
    coupling = 1.0 + coupling_amp * (l_col ** gamma) * (b_col ** (-delta))
    out = floor + size_part * (coupling ** rho)

    if n_sets == 1:
        return out[:, 0]
    return out

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 8-parameter enhanced scaling law by minimizing
    MSE in log-loss space, with positivity and stability bounds.

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    loss_values: array-like of observed losses.
    Returns the parameter vector
      [Linf, log A, log alpha, log beta, log B, gamma, delta, log rho].

    The optimizer's final iterate is returned when L-BFGS-B reports success.
    When it stops without converging (e.g. iteration limit), that iterate is
    still returned as long as it is finite and scores no worse than the
    initial guess; otherwise the initial guess is returned.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    y_min = np.min(y)
    # constant across objective evaluations, so compute it once
    log_y = np.log(y)

    # initial guesses
    linf0    = 0.9 * y_min
    A0       = max(1e-6, np.median(y) - linf0)
    alpha0   = 0.5
    beta0    = 0.5
    B0       = 0.1
    gamma0   = 0.0
    delta0   = 0.0
    rho0     = 1.0

    p0 = np.array([
        linf0,
        np.log(A0),
        np.log(alpha0),
        np.log(beta0),
        np.log(B0),
        gamma0,
        delta0,
        np.log(rho0)
    ], dtype=float)

    # bounds ensure physical/scalable predictions
    bnds = [
        (0.0,       y_min),       # Linf
        (-20.0,     20.0),        # log(A)
        (-5.0,      5.0),         # log(alpha)
        (-5.0,      5.0),         # log(beta)
        (-20.0,     20.0),        # log(B)
        (-10.0,     10.0),        # gamma
        (-10.0,     10.0),        # delta
        (-5.0,      5.0)          # log(rho)
    ]

    def objective(p):
        y_pred = scaling_law_func(X, p)
        # heavy penalty on non-positive predictions
        if np.any(y_pred <= 0.0):
            return 1e6 + np.sum((np.minimum(y_pred, 1e-6))**2)
        # log-space MSE
        d = np.log(y_pred) - log_y
        return np.mean(d * d)

    res = minimize(objective, p0, method='L-BFGS-B', bounds=bnds)

    p_fit = np.asarray(res.x, dtype=float)
    if p_fit.shape != p0.shape or not np.all(np.isfinite(p_fit)):
        return p0
    if res.success:
        return p_fit
    # success=False also covers iteration/evaluation limits and abnormal
    # line-search exits, where res.x usually still improves on the start.
    f_fit, f_start = objective(p_fit), objective(p0)
    if np.isfinite(f_fit) and (not np.isfinite(f_start) or f_fit <= f_start):
        return p_fit
    return p0
# EVOLVE-BLOCK-END

#4 Run 3 R² = 0.866336

▼

Python

import numpy as np
from scipy.optimize import minimize

# normalization constants: each input column is divided by its reference
# value before entering the power-law / log terms
# reference for column 0 (lr)
_lr0  = 1e-3
# reference for column 1 (bsz)
_bsz0 = 256.0
# reference for column 2 (D)
_D0   = 1e10
# reference for column 3 (N)
_N0   = 1e8

# EVOLVE-BLOCK-START
def scaling_law_func(data_points, params):
    """
    Evaluate the 8-parameter composite scaling law:

      L = Linf
        + A * n^(-alpha) * d^(-beta)
        + H * (a*ln(l) + b*ln(s) + c)^2

    where n = N/_N0, d = D/_D0, and l = lr/_lr0, s = bsz/_bsz0 are clipped
    from below at 1e-12 before the log.

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    params:      one parameter set of length 8 or a (T, 8) array, laid out as
                   [Linf, log(A), log(alpha), log(beta), log(H), a, b, c]
    Returns an array of shape (N,) when there is a single parameter set,
    otherwise (N, T).
    """
    pts = np.atleast_2d(np.asarray(data_points))
    col_lr, col_bsz = pts[:, 0], pts[:, 1]
    col_D, col_N = pts[:, 2], pts[:, 3]

    # lower clip on lr and bsz ratios keeps the logs below finite
    l_ratio = np.clip(col_lr / _lr0, 1e-12, None)
    s_ratio = np.clip(col_bsz / _bsz0, 1e-12, None)
    d_ratio = col_D / _D0
    n_ratio = col_N / _N0

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # per-set parameters as row vectors (1, T)
    floor = theta[:, 0][None, :]
    amp = np.exp(theta[:, 1])[None, :]
    alpha = np.exp(theta[:, 2])[None, :]
    beta = np.exp(theta[:, 3])[None, :]
    hyper_amp = np.exp(theta[:, 4])[None, :]
    a = theta[:, 5][None, :]
    b = theta[:, 6][None, :]
    c = theta[:, 7][None, :]

    # power-law term in N and D
    size_part = amp * (n_ratio[:, None] ** (-alpha)) * (d_ratio[:, None] ** (-beta))

    # squared linear form in log(lr), log(bsz)
    ln_l = np.log(l_ratio)[:, None]
    ln_s = np.log(s_ratio)[:, None]
    hyper_part = hyper_amp * (a * ln_l + b * ln_s + c) ** 2

    out = floor + size_part + hyper_part
    if out.shape[1] == 1:
        return out[:, 0]
    return out

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 8-parameter scaling law by minimizing
    log-space MSE: mean((ln(L_pred) - ln(L_true))^2).

    data_points: array-like whose columns 0..3 are lr, bsz, D, N.
    loss_values: array-like of observed losses.
    Returns the parameter vector
      [Linf, log(A), log(alpha), log(beta), log(H), a, b, c].

    The optimizer's final iterate is returned when L-BFGS-B reports success.
    When it stops without converging (e.g. iteration limit), that iterate is
    still returned as long as it is finite and scores no worse than the
    initial guess; otherwise the initial guess is returned.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).ravel()
    y_min = np.min(y)
    # constant across objective evaluations, so compute it once
    log_y = np.log(y)

    # initialize parameters
    Linf0 = y_min * 0.9
    A0    = max(1e-3, np.max(y) - Linf0)
    p0 = np.zeros(8, dtype=float)
    p0[0] = Linf0         # Linf
    p0[1] = np.log(A0)    # log(A)
    p0[2] = 0.0           # log(alpha)
    p0[3] = 0.0           # log(beta)
    p0[4] = np.log(0.1)   # log(H)
    p0[5] = 0.0           # a
    p0[6] = 0.0           # b
    p0[7] = 0.0           # c

    # parameter bounds for stability
    bnds = [
        (0.0,         y_min),      # Linf >= 0, <= min observed
        (-20.0, 20.0),             # log(A)
        (-5.0, 5.0),               # log(alpha)
        (-5.0, 5.0),               # log(beta)
        (-15.0, 5.0),              # log(H)
        (-5.0, 5.0),               # a
        (-5.0, 5.0),               # b
        (-5.0, 5.0)                # c
    ]

    def objective(p):
        y_pred = scaling_law_func(X, p)
        # penalty for non-positive predictions
        if np.any(y_pred <= 0):
            return 1e6 + np.sum((np.minimum(y_pred, 1e-6))**2)
        return np.mean((np.log(y_pred) - log_y)**2)

    res = minimize(objective, p0, method='L-BFGS-B', bounds=bnds,
                   options={'maxiter': 1000})

    p_fit = np.asarray(res.x, dtype=float)
    if p_fit.shape != p0.shape or not np.all(np.isfinite(p_fit)):
        return p0
    if res.success:
        return p_fit
    # success=False also covers iteration/evaluation limits and abnormal
    # line-search exits, where res.x usually still improves on the start.
    f_fit, f_start = objective(p_fit), objective(p0)
    if np.isfinite(f_fit) and (not np.isfinite(f_start) or f_fit <= f_start):
        return p_fit
    return p0
# EVOLVE-BLOCK-END

#5 Run 2 R² = 0.865452

▼

Python

import numpy as np
from scipy.optimize import minimize

# normalization constants for numerical stability: the lr, bsz, D and N
# columns are divided by these reference values before use
_lr0, _bsz0, _D0, _N0 = 1e-3, 256.0, 1e10, 1e8

def scaling_law_func(data_points, params):
    """
    Evaluate the 8-parameter composite scaling law:

      L = Linf
        + A * n^(-alpha) * d^(-beta)
        + B * s^(-delta)
        + C * exp[-0.5 * (ln(l) / sigma)^2]

    where n = N/_N0, d = D/_D0, s = bsz/_bsz0 and ln(l) = log(lr/_lr0 + 1e-16).

    data_points: array-like with exactly 4 columns (lr, bsz, D, N).
    params:      8 values (flattened before use), laid out as
                   [Linf, logA, log_alpha, log_beta,
                    logB, log_delta, logC, log_sigma]
                 All but Linf are exponentiated; sigma additionally gets a
                 1e-16 offset so the division below never sees zero.
    Returns a 1-D array with one predicted loss per data row.
    Raises ValueError on a wrong column count or parameter count.
    """
    pts = np.atleast_2d(np.asarray(data_points, float))
    if pts.shape[1] != 4:
        raise ValueError(f"Expected data_points with 4 columns, got {pts.shape[1]}")
    col_lr, col_bsz, col_D, col_N = pts[:, 0], pts[:, 1], pts[:, 2], pts[:, 3]

    # normalized inputs
    n_ratio = col_N / _N0
    d_ratio = col_D / _D0
    s_ratio = col_bsz / _bsz0
    ln_l = np.log(col_lr / _lr0 + 1e-16)

    theta = np.ravel(params).astype(float)
    if theta.size != 8:
        raise ValueError(f"Expected 8 parameters, got {theta.size}")

    floor = theta[0]
    amp_size = np.exp(theta[1])
    alpha = np.exp(theta[2])
    beta = np.exp(theta[3])
    amp_bsz = np.exp(theta[4])
    delta = np.exp(theta[5])
    amp_lr = np.exp(theta[6])
    sigma = np.exp(theta[7]) + 1e-16

    size_part = amp_size * (n_ratio ** -alpha) * (d_ratio ** -beta)
    bsz_part = amp_bsz * (s_ratio ** -delta)
    # Gaussian-shaped term in log(lr), centred at lr == _lr0
    lr_part = amp_lr * np.exp(-0.5 * (ln_l / sigma) ** 2)

    return floor + size_part + bsz_part + lr_part

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 8-parameter composite law by minimizing a pseudo-Huber
    loss on log-residuals to emphasize relative errors.

    data_points: array-like with 4 columns (lr, bsz, D, N); a single 1-D
                 point is promoted to one row, matching scaling_law_func.
    loss_values: array-like of observed losses, one per data row.
    Returns optimized parameter vector of length 8:
      [Linf, logA, log_alpha, log_beta, logB, log_delta, logC, log_sigma].
    Raises ValueError when the number of rows and losses differ.

    The optimizer's final iterate is returned when L-BFGS-B reports success.
    When it stops without converging (e.g. iteration limit), that iterate is
    still returned as long as it is finite and scores no worse than the
    initial guess; otherwise the initial guess is returned.
    """
    # atleast_2d so the row count below agrees with how scaling_law_func
    # interprets the same input
    X = np.atleast_2d(np.asarray(data_points, float))
    y = np.ravel(loss_values).astype(float)
    if X.shape[0] != y.size:
        raise ValueError("Mismatched number of data points and loss values")

    # data-driven initialization
    y_min, y_max = y.min(), y.max()
    dy = max(y_max - y_min, 1e-8)

    Linf0   = y_min * 0.9
    A0      = dy * 0.5;    alpha0 = 0.3;  beta0  = 0.3
    B0      = dy * 0.2;    delta0 = 0.5
    C0      = dy * 0.1;    sigma0 = 1.0

    theta0 = np.array([
        Linf0,
        np.log(A0), np.log(alpha0), np.log(beta0),
        np.log(B0), np.log(delta0),
        np.log(C0), np.log(sigma0)
    ], dtype=float)

    eps = 1e-16
    h   = 0.1  # pseudo-Huber scale on log-residuals
    # constant across objective evaluations, so compute it once
    log_y = np.log(y + eps)

    def objective(p):
        y_pred = scaling_law_func(X, p)
        y_pred = np.maximum(y_pred, eps)
        r = np.log(y_pred) - log_y
        # pseudo-Huber on relative log error
        return np.mean(np.sqrt(1.0 + (r/h)**2) - 1.0)

    # bounds: Linf>=0, sigma>=1e-3 to avoid degenerate
    bounds = [(0.0, None),
              (None, None), (None, None), (None, None),
              (None, None), (None, None),
              (None, None), (np.log(1e-3), None)]

    res = minimize(objective, theta0, method='L-BFGS-B', bounds=bounds)

    theta_fit = np.asarray(res.x, dtype=float)
    if theta_fit.shape != theta0.shape or not np.all(np.isfinite(theta_fit)):
        return theta0
    if res.success:
        return theta_fit
    # success=False also covers iteration/evaluation limits and abnormal
    # line-search exits, where res.x usually still improves on the start.
    f_fit, f_start = objective(theta_fit), objective(theta0)
    if np.isfinite(f_fit) and (not np.isfinite(f_start) or f_fit <= f_start):
        return theta_fit
    return theta0