SLD - LR-BSZ Scaling Law - SLDAgent + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.983442

▼

Python

# EVOLVE-BLOCK-START
import numpy as np

# Log-polynomial + rational diminishing-returns + steps features; weighted ridge fit
def _design_matrix(X):
    """Build the 24-column feature matrix for the linear scaling-law model.

    Args:
        X: Array-like with four columns, unpacked in the order
            (lr, bsz, data, params). A single row is promoted to 2-D.

    Returns:
        Array of shape (n_rows, 24): an intercept, the four log10 inputs,
        their squares and pairwise products, the cube of log10(lr),
        inverse-power terms of data/params/batch relative to fixed reference
        values, and three terms built from the step ratio (data / batch).
    """
    raw = np.atleast_2d(np.asarray(X, dtype=float))
    tiny = 1e-12
    # Clip to a small positive floor so logs and negative powers stay finite.
    pos = np.clip(raw, tiny, None)
    log_lr, log_b, log_d, log_p = np.log10(pos).T

    # Fixed reference scales used to nondimensionalize the inputs.
    d_ref, p_ref, b_ref = 1.0e10, 3.0e8, 256.0
    steps_ref = d_ref / b_ref

    batch = pos[:, 1]
    tokens = pos[:, 2]
    size = pos[:, 3]
    steps = tokens / np.maximum(batch, tiny)

    d_rel = tokens / d_ref
    p_rel = size / p_ref
    b_rel = batch / b_ref
    s_rel = steps / steps_ref

    columns = [np.ones(raw.shape[0], float)]
    # First-order log terms.
    columns += [log_lr, log_b, log_d, log_p]
    # Pure quadratic log terms.
    columns += [log_lr**2, log_b**2, log_d**2, log_p**2]
    # Pairwise log interactions.
    columns += [
        log_lr*log_b, log_lr*log_d, log_lr*log_p,
        log_b*log_d, log_b*log_p, log_d*log_p,
    ]
    # Cubic term in the learning rate only.
    columns.append(log_lr**3)
    # Diminishing-returns power terms.
    columns += [
        d_rel**-0.5, d_rel**(-1.0/3.0),
        p_rel**-0.5, p_rel**(-1.0/3.0),
        b_rel**-0.5,
    ]
    # Step-count features.
    columns += [
        np.log10(np.maximum(s_rel, tiny)),
        np.log10(np.maximum(s_rel, tiny))**2,
        s_rel**-0.5,
    ]
    return np.column_stack(columns)

def scaling_law_func(data_points, params):
    """Evaluate the linear-in-features model at the given points.

    Args:
        data_points: Input rows accepted by ``_design_matrix``.
        params: Coefficient vector (one entry per feature column) or a 2-D
            array with one coefficient row per parameter set.

    Returns:
        1-D predictions when a single parameter set is given, otherwise a 2-D
        array with one column per parameter set.
    """
    features = _design_matrix(data_points)
    coeffs = np.asarray(params, dtype=float)
    if coeffs.ndim == 1:
        # Promote a single coefficient vector to a one-row matrix.
        coeffs = coeffs.reshape(1, -1)
    preds = features @ coeffs.T
    if preds.shape[1] == 1:
        return preds[:, 0]
    return preds

def fit_scaling_law(data_points, loss_values):
    """Fit the feature coefficients with a per-group weighted ridge regression.

    Feature columns are divided by their standard deviation (the intercept and
    near-constant columns are left unscaled), a diagonal penalty with
    group-specific weights is added to the normal equations, and the solution
    is mapped back to the unscaled feature space.

    Args:
        data_points: Input rows accepted by ``_design_matrix``.
        loss_values: Targets, 1-D or 2-D with one column per target.

    Returns:
        Coefficient vector for a single target, otherwise one row per target.
    """
    features = _design_matrix(data_points)
    targets = np.asarray(loss_values, dtype=float)
    if targets.ndim == 1:
        targets = targets[:, None]
    n_feats = features.shape[1]
    n_targets = targets.shape[1]

    # Column scaling; the intercept and degenerate columns keep unit scale.
    scale = features.std(axis=0)
    scale[0] = 1.0
    scale[scale < 1e-12] = 1.0
    scaled = features / scale

    # Penalty weight per feature group; the intercept is not penalized.
    penalty = np.ones(n_feats, dtype=float)
    penalty[0] = 0.0
    penalty[1:5] = 1.0      # linear log terms
    penalty[5:9] = 5.0      # squared log terms
    penalty[9:15] = 5.0     # pairwise log interactions
    penalty[15] = 8.0       # cubic learning-rate term
    penalty[16:21] = 10.0   # inverse-power terms
    penalty[21:24] = 6.0    # step-ratio terms
    ridge = 1e-2

    gram = scaled.T @ scaled + ridge * np.diag(penalty)
    rhs = scaled.T @ targets
    try:
        solution = np.linalg.solve(gram, rhs)
    except np.linalg.LinAlgError:
        # Singular system: fall back to the pseudo-inverse.
        solution = np.linalg.pinv(gram) @ rhs

    # Undo the column scaling so coefficients apply to the raw features.
    coeffs = (solution.T / scale).astype(float)
    if n_targets == 1:
        return coeffs[0]
    return coeffs
# EVOLVE-BLOCK-END

#2 Run 2 R² = 0.982606

▼

Python

# EVOLVE-BLOCK-START
import numpy as np

# Per-column reference scales: _phi divides the inputs by these before taking logs.
# NOTE(review): presumably ordered like the data columns (lr, bsz, data, params) - confirm.
_A = np.array([1e-3, 256.0, 1e10, 1e8], float)
# Lower/upper clip bounds applied to the exponent argument before np.exp.
_UMIN, _UMAX = -9.0, 7.0

def _phi(X):
    """Return the 14 quadratic log-features of the inputs.

    Each of the four input columns is divided by its reference scale in ``_A``
    and log-transformed. The output stacks the four logs, half of each squared
    log, and the six pairwise products, in that order.
    """
    pts = np.atleast_2d(np.asarray(X, float))
    logs = np.log(pts / _A)
    a, b, c, d = logs.T
    linear = [a, b, c, d]
    half_squares = [0.5*a**2, 0.5*b**2, 0.5*c**2, 0.5*d**2]
    cross = [a*b, a*c, a*d, b*c, b*d, c*d]
    return np.column_stack(linear + half_squares + cross)

def scaling_law_func(data_points, params):
    """Evaluate ``L0 + A * exp(clip(Phi @ w))`` at the given points.

    Args:
        data_points: Input rows accepted by ``_phi``.
        params: ``[L0, A, w...]`` as a 1-D vector, or a 2-D array with one
            such row per parameter set.

    Returns:
        1-D predictions for a 1-D ``params``; otherwise an array with one
        column per parameter row.
    """
    feats = _phi(data_points)
    n_feats = feats.shape[1]
    theta = np.asarray(params, float)

    if theta.ndim == 1:
        floor, amp = theta[0], theta[1]
        expo = feats @ theta[2:2+n_feats]
        # The exponent is clipped before exponentiation.
        return floor + amp * np.exp(np.clip(expo, _UMIN, _UMAX))

    floor = theta[:, 0]
    amp = theta[:, 1]
    expo = feats @ theta[:, 2:2+n_feats].T
    growth = np.exp(np.clip(expo, _UMIN, _UMAX))
    return floor[None, :] + amp[None, :] * growth

def fit_scaling_law(data_points, loss_values):
    """Fit ``L0 + A * exp(Phi @ w)`` by grid search plus golden-section refinement.

    For a candidate offset ``L0`` and ridge strength ``lam``, the weights ``w``
    come from a ridge regression of ``log(y - L0)`` on the features and the
    amplitude ``A`` from a closed-form least-squares rescale. ``L0`` and
    ``lam`` are first chosen on a coarse grid; ``L0`` is then refined with a
    golden-section search at the selected ``lam``. The candidate with the
    lowest mean squared error among the coarse optimum and the refined points
    is returned, so refinement can never make the fit worse.

    Args:
        data_points: Input rows accepted by ``_phi``.
        loss_values: Observed losses; flattened to 1-D.

    Returns:
        1-D array ``[L0, A, w_0, ..., w_{P-1}]`` where ``P`` is the number of
        feature columns produced by ``_phi``.
    """
    Phi = _phi(data_points)
    y = np.asarray(loss_values, float).ravel()
    P = Phi.shape[1]
    A0 = Phi.T @ Phi
    lam_grid = np.array([1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])

    # Search range for the offset: strictly below the smallest observed loss.
    ymin = float(np.min(y))
    lo = ymin - 0.9
    hi = ymin - 0.03
    if not (lo < hi):
        lo = ymin - 0.2
        hi = ymin - 1e-3
    L0_grid = np.linspace(lo, hi, 25)
    best = (np.inf, hi, 1.0, np.zeros(P), lam_grid[0])

    def solve_w(L0, lam):
        """Return (mse, w, A) for a fixed offset and ridge strength."""
        r = y - L0
        r = np.where(r > 1e-8, r, 1e-8)  # keep the log argument positive
        t = np.log(r)
        M = A0.copy()
        M.flat[::P+1] += lam  # add lam to the diagonal
        bt = Phi.T @ t
        try:
            w = np.linalg.solve(M, bt)
        except np.linalg.LinAlgError:
            # Equivalent augmented least-squares formulation of the ridge problem.
            w, *_ = np.linalg.lstsq(np.vstack([Phi, np.sqrt(lam)*np.eye(P)]),
                                    np.concatenate([t, np.zeros(P)]), rcond=None)
        u = Phi @ w
        e = np.exp(np.clip(u, _UMIN, _UMAX))
        # Closed-form amplitude minimizing ||A*e - r||^2, floored at 1e-8.
        A = (e @ r) / max(e @ e, 1e-12)
        A = float(max(A, 1e-8))
        yhat = L0 + A * e
        mse = ((yhat - y)**2).mean()
        return mse, w, A

    # coarse search over L0 and ridge strength
    for L0 in L0_grid:
        for lam in lam_grid:
            mse, w, A = solve_w(L0, lam)
            if mse < best[0]:
                best = (mse, L0, A, w, lam)

    mse_b, L0_b, A_b, w_b, lam_b = best

    # refine L0 with golden-section search
    gl, gh = max(lo, L0_b - 0.3), min(hi, L0_b + 0.15)
    if gl >= gh:
        gl, gh = lo, hi
    phi = (np.sqrt(5) - 1) / 2
    c = gh - phi * (gh - gl)
    d = gl + phi * (gh - gl)
    mc, wc, Ac = solve_w(c, lam_b)
    md, wd, Ad = solve_w(d, lam_b)
    for _ in range(14):
        if mc < md:
            gh, md, wd, Ad = d, mc, wc, Ac
            d = c
            c = gh - phi * (gh - gl)
            mc, wc, Ac = solve_w(c, lam_b)
        else:
            gl, mc, wc, Ac = c, md, wd, Ad
            c = d
            d = gl + phi * (gh - gl)
            md, wd, Ad = solve_w(d, lam_b)

    # The MSE need not be unimodal in L0, so the refined bracket can end up
    # worse than the coarse optimum. Only replace the coarse result when a
    # refined point actually improves on it.
    for mse_r, L0_r, w_r, A_r in ((md, d, wd, Ad), (mc, c, wc, Ac)):
        if mse_r < mse_b:
            mse_b, L0_b, w_b, A_b = mse_r, L0_r, w_r, A_r

    return np.concatenate([[L0_b, A_b], w_b])
# EVOLVE-BLOCK-END

#3 Run 4 R² = 0.982010

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def _sp(z):
    """Softplus ``log(1 + exp(z))`` in an overflow-safe form."""
    arr = np.asarray(z, dtype=float)
    positive_part = np.maximum(arr, 0.0)
    # exp is only ever taken of a non-positive number, so it cannot overflow.
    tail = np.log1p(np.exp(-np.abs(arr)))
    return tail + positive_part

def scaling_law_func(data_points, params):
    """Evaluate the 17-parameter additive loss model.

    The prediction is the sum of an offset, three power-law terms (in P,
    D relative to P, and D/bsz), an asymmetric softplus learning-rate penalty
    and a softplus batch-size penalty.

    Args:
        data_points: Array-like with 4 columns
            [lr, bsz, data_size, non_embedding_param_size].
        params: At least 17 values; only the first 17 are used. Amplitudes,
            exponents and slopes that must be positive are passed through a
            softplus.

    Returns:
        1-D array of predictions, one per input row.
    """
    pts = np.atleast_2d(np.asarray(data_points, dtype=float))
    assert pts.shape[1] == 4, "data_points must have 4 columns: [lr, bsz, data_size, non_embedding_param_size]"
    lr, bsz, D, P = pts.T
    tiny = 1e-12
    P0, D0 = 1e8, 1e10

    theta = np.asarray(params, dtype=float).ravel()
    assert theta.size >= 17, "params must have length >= 17"
    (Linf, raw_A_P, raw_aP, raw_A_PD, raw_aPD, raw_A_S, raw_aS,
     lr0, raw_C_lr, raw_kpos, raw_kneg, v_b, v_P,
     raw_B_b, raw_g_b, logb0, cP) = theta[:17]

    # Map the unconstrained raw values to positive ones.
    A_P, aP, A_PD, aPD, A_S, aS = (
        _sp(v) for v in (raw_A_P, raw_aP, raw_A_PD, raw_aPD, raw_A_S, raw_aS)
    )
    C_lr, kpos, kneg = (_sp(v) for v in (raw_C_lr, raw_kpos, raw_kneg))
    B_b, g_b = _sp(raw_B_b), _sp(raw_g_b)

    log_lr = np.log(np.clip(lr, tiny, None))
    log_b = np.log(np.clip(bsz, tiny, None))
    log_p = np.log(np.clip(P/P0, tiny, None))
    log_d = np.log(np.clip(D/D0, tiny, None))
    log_dp = log_d - log_p
    log_steps = log_d - log_b  # effective steps ~ D/bsz

    # Power-law improvement terms, written as exp(-exponent * log(x)).
    term_P = A_P * np.exp(-aP * log_p)
    term_PD = A_PD * np.exp(-aPD * log_dp)
    term_S = A_S * np.exp(-aS * log_steps)

    # Learning-rate penalty: asymmetric softplus around a centre that shifts
    # with log batch size and log model size; zero at the centre when the two
    # slopes match because 2*log(2) = log(4) is subtracted.
    centre = lr0 + v_b * log_b + v_P * log_p
    offset = log_lr - centre
    lr_pen = C_lr * (_sp(kpos * offset) + _sp(kneg * (-offset)) - np.log(4.0))

    # Batch-size penalty relative to a model-size dependent critical batch.
    log_b_crit = logb0 + cP * log_p
    bpen = B_b * (_sp(g_b * (log_b_crit - log_b)) - np.log(2.0))

    total = Linf + term_P + term_PD + term_S + lr_pen + bpen
    return np.ravel(total)

def fit_scaling_law(data_points, loss_values):
    """Fit the 17-parameter model with multi-start L-BFGS-B on a Huber loss.

    Starting points are a heuristic initial vector, nine variants that place
    the learning-rate centre and critical-batch intercept at data quantiles,
    and eight seeded random perturbations. The parameter vector with the
    lowest finite objective value over all runs (including the unoptimized
    initial vector) is returned.

    Args:
        data_points: Array-like with 4 columns, as in ``scaling_law_func``.
        loss_values: Observed losses; flattened to 1-D.

    Returns:
        1-D array of 17 raw (unconstrained) parameters.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float).ravel()
    assert X.shape[1] == 4, "data_points must have 4 columns"
    N = X.shape[0]
    eps = 1e-12
    llr = np.log(np.clip(X[:,0], eps, None))
    lb  = np.log(np.clip(X[:,1], eps, None))
    lnP = np.log(np.clip(X[:,3]/1e8, eps, None))

    # Regress log-lr on [1, lb, lnP] to seed lr0, v_b, v_P
    M = np.column_stack([np.ones(N), lb, lnP])
    try:
        coef, _, _, _ = np.linalg.lstsq(M, llr, rcond=None)
    except np.linalg.LinAlgError:
        coef = np.array([np.mean(llr), 0.2, 0.1], dtype=float)
    lr0_init, vb_init, vP_init = map(float, coef)

    Linf_init  = float(np.percentile(y, 5))
    logb0_init = float(np.median(lb))

    p0 = np.array([
        Linf_init,   # Linf
        0.6, 0.10,   # A_P, aP
        0.3, 0.12,   # A_PD, aPD
        0.2, 0.10,   # A_S, aS
        lr0_init,    # lr0
        0.15, 2.0, 2.5,   # C_lr, kpos, kneg
        vb_init, vP_init, # v_b, v_P
        0.12, 0.60,  # B_b, g_b
        logb0_init,  # logb0
        0.30         # cP
    ], dtype=float)

    def huber(r, d=0.15):
        """Huber loss: quadratic within ``d`` of zero, linear beyond."""
        a = np.abs(r)
        return np.where(a <= d, 0.5 * r*r, d * (a - 0.5 * d))

    def obj(p):
        """Mean Huber loss plus a small L2 penalty on the raw parameters."""
        pred = scaling_law_func(X, p)
        return np.mean(huber(pred - y)) + 1e-6 * np.sum(p*p)

    best_p = p0.copy()
    best_f = obj(best_p)

    rng = np.random.RandomState(42)
    starts = [p0]
    ql = np.quantile(llr, [0.25, 0.5, 0.75])
    qb = np.quantile(lb,  [0.25, 0.5, 0.75])
    for l0 in ql:
        for b0 in qb:
            pp = p0.copy()
            pp[7]  = l0   # lr0
            pp[15] = b0   # logb0
            starts.append(pp)
    for _ in range(8):
        noise = rng.normal(scale=0.2, size=p0.size)
        noise[[0,7,15]] *= 0.1  # smaller jitter on Linf, lr0 and logb0
        starts.append(p0 + noise)

    for start in starts:
        res = minimize(obj, start, method="L-BFGS-B", options={"maxiter": 450, "ftol": 1e-12})
        # Judge each run by its objective value alone. A run that stops on the
        # iteration/evaluation limit reports success=False but may still hold
        # the best point found, so it must not be thrown away.
        if np.isfinite(res.fun) and res.fun < best_f:
            best_f, best_p = res.fun, res.x

    return best_p
# EVOLVE-BLOCK-END

#4 Run 5 R² = 0.915408

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def _softplus(x):
    """Return softplus ``log(1 + exp(x))`` without overflowing for large ``x``.

    The naive ``log1p(exp(x))`` overflows to ``inf`` once ``exp(x)`` exceeds
    the float range. Using ``max(x, 0) + log1p(exp(-|x|))`` only ever takes
    exp of a non-positive number.

    Args:
        x: Scalar or array-like.

    Returns:
        Softplus of ``x``, elementwise.
    """
    x = np.asarray(x, dtype=float)
    return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))

def scaling_law_func(data_points, params):
    """Evaluate a log-linear model with a quadratic learning-rate penalty.

    Working on natural logs of the (slightly shifted) inputs, each prediction
    is a linear combination of the four logs plus ``q * (lr - lr_opt)**2``,
    where ``lr_opt`` is itself linear in the other three logs and ``q`` is a
    softplus of its raw parameter.

    Args:
        data_points: Array-like whose first four columns are read as
            (lr, b, d, m).
        params: Vector of 10 values or a 2-D array with 10 columns.

    Returns:
        1-D predictions for a single parameter set, otherwise one column per
        parameter row.

    Raises:
        ValueError: If the parameter rows do not have exactly 10 entries.
    """
    pts = np.atleast_2d(np.asarray(data_points, dtype=float))
    theta = np.asarray(params, dtype=float)
    if theta.ndim == 1:
        theta = theta[None, :]
    n_sets, n_par = theta.shape
    if n_par != 10:
        raise ValueError("params must have length 10: [c,sP,sD,sB,sLR,q_raw,a0,a1,a2,a3]")

    logs = np.log(pts + 1e-12)
    log_lr, log_b, log_d, log_m = (logs[:, j] for j in range(4))

    preds = np.empty((pts.shape[0], n_sets))
    for col, row in enumerate(theta):
        c, sP, sD, sB, sLR, q_raw, a0, a1, a2, a3 = row
        # Softplus keeps the curvature strictly positive.
        curvature = _softplus(q_raw) + 1e-12
        best_lr = a0 + a1 * log_b + a2 * log_m + a3 * log_d
        preds[:, col] = (c + sP * log_m + sD * log_d + sB * log_b + sLR * log_lr
                         + curvature * (log_lr - best_lr) ** 2)

    if preds.shape[1] == 1:
        return preds[:, 0]
    return preds

def fit_scaling_law(data_points, loss_values):
    """Fit the 10-parameter model with bounded multi-start L-BFGS-B.

    The objective is a smooth (pseudo-Huber) loss on the residuals plus a tiny
    mean-square penalty on the parameters. Three starts are tried: the
    heuristic initial point and two seeded Gaussian perturbations of it. The
    run with the lowest objective value is kept.

    Args:
        data_points: Array-like whose first four columns are read as
            (lr, b, d, m).
        loss_values: Targets, 1-D or 2-D with one column per target.

    Returns:
        Vector of 10 parameters for a single target, otherwise one row per
        target.
    """
    pts = np.atleast_2d(np.asarray(data_points, dtype=float))
    targets = np.asarray(loss_values, dtype=float)
    logs = np.log(pts + 1e-12)
    log_lr, log_b, _, _ = (logs[:, j] for j in range(4))
    if targets.ndim == 1:
        targets = targets[:, None]
    n_tasks = targets.shape[1]
    n_par = 10

    # Heuristic starting point, replicated for every target column.
    start_c = float(np.clip(np.median(targets), 1.5, 5.0))
    start_a0 = float(np.mean(log_lr) - 0.5 * np.mean(log_b))
    seed_row = np.array(
        [start_c, -0.06, -0.08, -0.01, 0.0, -1.0, start_a0, 0.5, -0.2, -0.1],
        dtype=float,
    )
    seed = np.tile(seed_row, (n_tasks, 1))

    box_row = [
        (1.5, 5.0),    # c
        (-1.0, 0.0),   # sP
        (-1.0, 0.0),   # sD
        (-0.5, 0.5),   # sB
        (-0.5, 0.5),   # sLR
        (-6.0, 6.0),   # q_raw
        (-12.0, 0.0),  # a0
        (0.0, 2.0),    # a1
        (-2.0, 2.0),   # a2
        (-2.0, 2.0),   # a3
    ]
    box = box_row * n_tasks
    ridge = 1e-6
    width = 0.08  # transition scale of the pseudo-Huber loss

    def objective(flat):
        """Pseudo-Huber data loss plus the parameter penalty."""
        theta = flat.reshape(n_tasks, n_par)
        pred = scaling_law_func(pts, theta)
        if n_tasks == 1:
            pred = pred[:, None]
        resid = pred - targets
        loss = width**2 * (np.sqrt(1.0 + (resid / width)**2) - 1.0)
        return float(np.mean(loss) + ridge * np.mean(theta**2))

    rng = np.random.default_rng(0)
    candidates = [seed]
    for sigma in (0.15, 0.3):
        candidates.append(seed + rng.normal(0.0, sigma, seed.shape))

    top_x, top_f = seed.ravel(), np.inf
    for cand in candidates:
        res = minimize(objective, cand.ravel(), method='L-BFGS-B', bounds=box,
                       options={'maxiter': 1200, 'ftol': 1e-9})
        if res.fun < top_f:
            top_f, top_x = res.fun, res.x

    fitted = top_x.reshape(n_tasks, n_par)
    if n_tasks == 1:
        return fitted[0]
    return fitted
# EVOLVE-BLOCK-END

#5 Run 3 R² = 0.910510

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# Fixed anchors for nondimensionalization (data-independent)
# _pred divides D by _DR, P by _PR and b (batch size) by _BR.
_DR, _PR, _BR = 1.0e10, 3.0e8, 256.0
# NOTE(review): _UR is not referenced by any code visible in this section.
_UR = _DR / _BR  # reference number of updates

def _sp(x):
    """Return softplus ``log(1 + exp(x))``, passing ``x`` through above 20.

    ``np.where`` evaluates both branches, so the exp argument is capped at 20.
    Without the cap, large inputs overflow in the branch that is then thrown
    away and emit a spurious RuntimeWarning. Returned values are unchanged.

    Args:
        x: Scalar or array-like.

    Returns:
        ndarray (0-d for scalar input) of softplus values.
    """
    x = np.asarray(x, float)
    return np.where(x > 20.0, x, np.log1p(np.exp(np.minimum(x, 20.0))))

def _spi(v):
    """Inverse softplus ``log(exp(v) - 1)``, with ``v`` floored at 1e-12."""
    floored = np.maximum(v, 1e-12)
    shifted = np.expm1(floored)
    return np.log(shifted)

def _pred(X, th):
    """Evaluate the 16-parameter model for one parameter vector.

    Parameter layout of ``th`` (entries marked raw go through softplus):
        0       L0       offset
        1, 2    aD, beta       (raw) data power law
        3, 4    aP, alpha      (raw) parameter-count power law
        5, 6    aB, eta        (raw) batch-size power law
        7       q2             (raw) quadratic LR curvature
        8, 9    log_lr0, s_b   LR centre intercept and batch slope
        10-12   aC, b2, phi    (raw) data-adequacy coupling
        13, 14  s_pd, s_u      LR centre slopes
        15      q4             (raw) quartic LR curvature

    Args:
        X: 2-D array whose columns 0..3 are read as lr, batch, D and P.
        th: Sequence of 16 parameters.

    Returns:
        1-D array of predictions, one per row of ``X``.
    """
    # Floor the inputs so logs and ratios stay finite.
    rate = np.clip(X[:,0], 1e-12, None)
    batch = np.clip(X[:,1], 1.0, None)
    tokens = np.clip(X[:,2], 1.0, None)
    size = np.clip(X[:,3], 1.0, None)

    offset = float(th[0])
    aD, beta, aP, alpha, aB, eta, q2 = (_sp(th[k]) for k in range(1, 8))
    centre0 = float(th[8])
    slope_b = float(th[9])
    aC, b2, phi = (_sp(th[k]) for k in (10, 11, 12))
    slope_pd = float(th[13])
    slope_u = float(th[14])
    q4 = _sp(th[15])

    # Core power-law terms
    term_data = aD * (_DR / tokens) ** beta
    term_size = aP * (_PR / size) ** alpha
    term_batch = aB * (_BR / batch) ** eta

    # Data adequacy coupling: required tokens grow as P**phi
    tokens_needed = _DR * (size / _PR) ** phi
    term_couple = aC * (tokens_needed / tokens) ** b2

    # Learning-rate centre depends on batch, P/D and the update count D/B.
    log_rate = np.log(rate)
    log_batch = np.log(batch / _BR + 1e-12)
    log_pd = np.log(size / _PR) - np.log(tokens / _DR)
    log_updates = np.log((tokens / _DR) / (batch / _BR) + 1e-12)
    centre = centre0 + slope_b * log_batch + slope_pd * log_pd + slope_u * log_updates
    dev = log_rate - centre
    lr_penalty = q2 * dev * dev + q4 * dev ** 4

    return offset + term_data + term_size + term_batch + term_couple + lr_penalty

def scaling_law_func(data_points, params):
    """Evaluate the model for one parameter vector or a stack of them.

    Args:
        data_points: Input rows; a single row is promoted to 2-D.
        params: 1-D parameter vector, or a 2-D array with one vector per row.

    Returns:
        1-D predictions for a 1-D ``params``; otherwise a 2-D array with one
        column per parameter row.
    """
    pts = np.atleast_2d(np.asarray(data_points, float))
    theta = np.asarray(params, float)
    if theta.ndim != 1:
        per_row = [_pred(pts, theta[k]) for k in range(theta.shape[0])]
        return np.column_stack(per_row)
    return _pred(pts, theta)

def fit_scaling_law(data_points, loss_values):
    """Fit the 16-parameter model per target with multi-start L-BFGS-B.

    For every target column, ten seeded random perturbations of a heuristic
    initial vector are optimized against a pseudo-Huber loss with mild
    regularization. The run with the lowest finite objective value is kept,
    whether or not the optimizer reported convergence.

    Args:
        data_points: Input rows; column 0 is read here as the learning rate.
        loss_values: Targets, 1-D or 2-D with one column per target.

    Returns:
        Vector of 16 raw parameters for a single target, otherwise one row
        per target.
    """
    X = np.atleast_2d(np.asarray(data_points, float))
    y = np.asarray(loss_values, float)
    Y = y[:, None] if y.ndim == 1 else y
    T = Y.shape[1]

    ln_lr = np.log(np.clip(X[:,0], 1e-12, None))

    def init_vec(yt):
        """Heuristic raw starting vector scaled to the spread of ``yt``."""
        m = float(np.median(yt))
        span = float(np.clip(np.percentile(yt, 90) - np.percentile(yt, 10), 0.1, 5.0))
        # Scales
        aD0 = aP0 = aC0 = 0.30 * span
        aB0 = 0.10 * span
        # Exponents/prior slopes
        beta0 = alpha0 = 0.25
        eta0  = 0.50
        b20   = 0.25
        phi0  = 0.50
        # LR curvature and center slopes
        q20   = 0.10 * span
        q40   = 0.01 * span
        l0    = float(np.median(ln_lr))
        sb0   = 0.50
        spd0  = 0.05
        su0   = 0.10
        # Positive quantities are stored as inverse-softplus raw values.
        return np.array([
            m,
            _spi(aD0), _spi(beta0),
            _spi(aP0), _spi(alpha0),
            _spi(aB0), _spi(eta0),
            _spi(q20), l0, sb0,
            _spi(aC0), _spi(b20), _spi(phi0),
            spd0, su0,
            _spi(q40)
        ], float)

    def obj(th, yt, delta):
        """Pseudo-Huber data loss plus regularization."""
        r = scaling_law_func(X, th) - yt
        s = r / delta
        phuber = delta * delta * (np.sqrt(1.0 + s * s) - 1.0)
        beta=_sp(th[2]); alpha=_sp(th[4]); eta=_sp(th[6]); b2=_sp(th[11]); phi=_sp(th[12])
        q2=_sp(th[7]); q4=_sp(th[15])
        # Regularization: priors for exponents/slopes, small quartic
        reg = 1e-6 * np.sum(th[1:]**2)
        reg += 5e-5 * ((beta-0.25)**2 + (alpha-0.25)**2 + (eta-0.50)**2 + (b2-0.25)**2 + (phi-0.50)**2)
        reg += 2e-5 * (q2**2) + 1e-4 * (q4**2)
        reg += 1e-4 * (th[0]**2 + th[8]**2) + 5e-5 * (th[9]**2 + th[13]**2 + th[14]**2)
        return float(np.mean(phuber) + reg)

    out = np.zeros((T, 16), float)
    rng = np.random.default_rng(123)
    for t in range(T):
        yt = Y[:, t]
        base = init_vec(yt)
        delta = max(0.1, 0.25 * float(np.std(yt)))
        best, bestv = base.copy(), np.inf
        for _ in range(10):
            n = np.zeros_like(base)
            n[0] = rng.normal(0, 0.15)            # L0
            n[1:8] = rng.normal(0, 0.25, 7)       # positive raw params
            n[8] = rng.normal(0, 0.4)             # log_lr0
            n[9] = rng.normal(0, 0.2)             # s_b
            n[10:13] = rng.normal(0, 0.25, 3)     # aC, b2, phi
            n[13] = rng.normal(0, 0.15)           # s_pd
            n[14] = rng.normal(0, 0.15)           # s_u
            n[15] = rng.normal(0, 0.2)            # q4
            x0 = base + n
            res = minimize(obj, x0, args=(yt, delta), method='L-BFGS-B', options={'maxiter': 1000})
            # Select on the objective value alone. Requiring res.success
            # discarded runs that stopped on the iteration/evaluation limit,
            # and if every run did so the unoptimized `base` was returned.
            if np.isfinite(res.fun) and res.fun < bestv:
                bestv, best = res.fun, res.x
        out[t] = best
    return out[0] if T == 1 else out
# EVOLVE-BLOCK-END