Parallel Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.999985
Mean R²: 0.999955
Min R²: 0.999927
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 3 R² = 0.999985
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    # loss(N,k) = L0 + A*(N/1e9)^(-alpha) + G*(k^(-1/2) - 1)
    # 4 params: [L0, logA, logalpha, logG]; A, alpha, G are kept positive via exp()
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    th = np.asarray(params, dtype=float)
    if th.ndim == 1:
        th = th[None, :]
    th = th[:, :4]

    N = np.maximum(X[:, 0] / 1e9, 1e-12)
    k = np.maximum(X[:, 1], 1.0)
    logN = np.log(N)[:, None]
    termK = (k[:, None] ** -0.5) - 1.0
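    # termK is 0 at k = 1 and tends to -1 as k -> inf, so G caps the loss reduction from parallelism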

    L0    = th[:, 0][None, :]
    A     = np.exp(th[:, 1])[None, :]
    alpha = np.exp(th[:, 2])[None, :]
    G     = np.exp(th[:, 3])[None, :]

    pred = L0 + A * np.exp(-alpha * logN) + G * termK
    return pred[:, 0] if pred.shape[1] == 1 else pred

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float)

    N = np.maximum(X[:, 0] / 1e9, 1e-12)
    k = np.maximum(X[:, 1], 1.0)
    logN = np.log(N)
    termK = k ** -0.5 - 1.0

    # Robust Huber parameters and tiny L2 for numerical stability
    delta = max(0.02, 0.05 * float(np.std(y)))
    l2 = 1e-8
    n = y.size

    # Grid-search initialization over alpha, LS for (L0, A, G)
    best = None
    for a0 in (0.15, 0.2, 0.25, 0.3, 0.35):
        D = np.column_stack([np.ones_like(N), N ** -a0, termK])
        reg = 1e-12
        w = np.linalg.lstsq(D.T @ D + reg * np.eye(3), D.T @ y, rcond=None)[0]
        L0_0, A0, G0 = float(w[0]), float(abs(w[1]) + 1e-9), float(abs(w[2]) + 1e-9)
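        # Score this alpha candidate by plain MSE using the positivity-projected coefficients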
        mse = np.mean((L0_0 + A0 * N ** -a0 + G0 * termK - y) ** 2)
        if best is None or mse < best[0]:
            best = (mse, np.array([L0_0, np.log(A0), np.log(a0), np.log(G0)], dtype=float))
    theta0 = best[1]

    def obj_grad(theta):
        L0r, logA, logalpha, logG = theta
        A = np.exp(logA)
        a = np.exp(logalpha)
        G = np.exp(logG)
        termN = np.exp(-a * logN)
        pred = L0r + A * termN + G * termK
        r = pred - y
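        # Pseudo-Huber objective: ~r^2/2 for |r| << delta, ~delta*|r| for outliers (robust to noisy points)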
        s = np.sqrt(1.0 + (r / delta) ** 2)
        obj = (delta * delta * (s - 1.0)).mean() + l2 * np.dot(theta, theta)

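        # Analytic gradient: d(mean loss)/dr_i = (r_i / s_i) / n; chain rule through exp() handles the log-parameters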
        d = (r / s) / n
        gL = d.sum()
        g_logA = np.sum(d * A * termN)
        g_logalpha = np.sum(d * A * a * (-logN) * termN)
        g_logG = np.sum(d * G * termK)
        grad = np.array([gL, g_logA, g_logalpha, g_logG]) + 2 * l2 * theta
        return obj, grad

    # obj_grad returns (objective, gradient), so pass jac=True rather than evaluating it twice per step
    res = minimize(obj_grad, theta0, method='L-BFGS-B', jac=True,
                   options=dict(maxiter=300, ftol=1e-12, gtol=1e-8))
    return res.x if res.success else theta0
# EVOLVE-BLOCK-END
#2 Run 4 R² = 0.999957
#3 Run 1 R² = 0.999954
#4 Run 2 R² = 0.999954
#5 Run 5 R² = 0.999927
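
Usage sketch (illustrative, not part of any submitted run): the snippet below assumes the best run's scaling_law_func and fit_scaling_law are in scope, generates synthetic (N, k) data from the same functional form, fits it, and reports R². The parameter values, sample size, and noise level are arbitrary assumptions, not benchmark data.

Python
# Illustrative only: synthetic data with assumed parameters
import numpy as np

rng = np.random.default_rng(0)
n_pts = 200
N = rng.uniform(1e8, 7e10, size=n_pts)            # model sizes in parameters
k = rng.integers(1, 9, size=n_pts).astype(float)  # number of parallel streams
X = np.column_stack([N, k])

# Ground-truth [L0, logA, logalpha, logG], chosen arbitrarily for the demo
theta_true = np.array([2.0, np.log(1.5), np.log(0.3), np.log(0.4)])
y = scaling_law_func(X, theta_true) + rng.normal(0.0, 0.01, size=n_pts)

theta_hat = fit_scaling_law(X, y)
pred = scaling_law_func(X, theta_hat)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(f"R^2 = {r2:.6f}")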