SLD - SFT Scaling Law - SLDAgent + GPT-5

Best Run 5 R² = 0.999265

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the saturating (Hill-form) law L(n) = c + A / (1 + (n / n0)**b).

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``n``.
        params: ``[c, A, b]`` or ``[c, A, b, n0]`` for one target, or a 2-D
            array with one such row per target.  With three values per row,
            ``n0`` is taken to be 1.

    Returns:
        A 1-D array of predictions for a single parameter row, otherwise a
        2-D array with one column per parameter row.

    Raises:
        ValueError: If a parameter row does not have 3 or 4 entries.
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0].astype(float)
    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[np.newaxis, :]
    n_targets, n_params = theta.shape
    if n_params != 3 and n_params != 4:
        raise ValueError("params must have length 3 or 4 per target")

    floor = theta[:, 0][np.newaxis, :]
    amplitude = theta[:, 1][np.newaxis, :]
    exponent = theta[:, 2][np.newaxis, :]
    if n_params == 4:
        midpoint = theta[:, 3][np.newaxis, :]
    else:
        midpoint = np.ones((1, n_targets))

    # Floor both n and n0 so the logarithms stay finite.
    log_n = np.log(np.maximum(sizes[:, np.newaxis], 1e-12))
    log_n0 = np.log(np.maximum(midpoint, 1e-12))
    # (n / n0)**b is formed as exp(b * (log n - log n0)); the exponent is
    # clipped to [-50, 50] before exponentiation.
    power = np.exp(np.clip(exponent * (log_n - log_n0), -50.0, 50.0))
    curve = floor + amplitude / (1.0 + power)
    if curve.shape[1] == 1:
        return curve[:, 0]
    return curve

def fit_scaling_law(data_points, loss_values):
    """Fit L(n) = c + A / (1 + (n / n0)**b) to each column of ``loss_values``.

    For every target the routine
      1. seeds ``c`` and ``A`` from the 5th / 95th percentiles of the losses,
      2. estimates ``b`` and ``n0`` from a least-squares line through
         ``log(A / (y - c) - 1)`` versus ``log n``,
      3. re-solves ``c`` and ``A`` by linear least squares with ``b``, ``n0``
         held fixed,
      4. picks the better of two candidate starts and minimises a mean Huber
         loss with L-BFGS-B over the logarithms of the four parameters (so
         every returned parameter is positive).

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``n``.
        loss_values: 1-D array of losses, or a 2-D array with one column per
            target.

    Returns:
        ``[c, A, b, n0]`` as a 1-D array for a single target, otherwise a 2-D
        array with one such row per target.
    """
    X = np.atleast_2d(np.asarray(data_points))
    n = X[:, 0].astype(float)
    y = np.asarray(loss_values)
    Y = y[:, None] if y.ndim == 1 else y
    T = Y.shape[1]
    ln = np.log(np.maximum(n, 1e-12))
    # Geometric mean of n: fallback midpoint when the linearised estimate fails.
    n_geo = float(np.exp(np.mean(ln)))
    lstsq_errors = (np.linalg.LinAlgError, ValueError)

    def huber(r, d):
        a = np.abs(r)
        m = np.minimum(a, d)
        return 0.5 * m * m + d * (a - m)

    def dhuber(r, d):
        a = np.abs(r)
        return np.where(a <= d, r, d * np.sign(r))

    def pack(p):
        # Optimise in log space; the small offset keeps log() finite at zero.
        c, A, b, n0 = p
        return np.log([c + 1e-12, A + 1e-12, b + 1e-12, n0 + 1e-12])

    def unpack(u):
        return np.exp(u[0]), np.exp(u[1]), np.exp(u[2]), np.exp(u[3])

    out = []
    for t in range(T):
        yt = Y[:, t].astype(float)
        ymin, ymax = float(np.min(yt)), float(np.max(yt))
        c0 = max(np.percentile(yt, 5), 0.0)
        A0 = max(np.percentile(yt, 95) - c0, 1e-3)

        # Linearise: log(A / (y - c) - 1) = b * log(n) - b * log(n0).
        diff = np.maximum(yt - c0, 1e-8)
        S = np.log(np.maximum(A0 / diff - 1.0, 1e-8))
        Xls = np.vstack([ln, np.ones_like(ln)]).T
        try:
            sol = np.linalg.lstsq(Xls, S, rcond=None)[0]
        except lstsq_errors:
            b0 = 0.5
            n00 = n_geo
        else:
            b0 = max(sol[0], 1e-3)
            n00 = float(np.exp(-sol[1] / b0))
        if not np.isfinite(n00) or n00 <= 0:
            n00 = n_geo

        # With b0 and n00 fixed the model is linear in (c, A).
        phi = 1.0 / (1.0 + np.exp(np.clip(b0 * (ln - np.log(max(n00, 1e-12))), -50.0, 50.0)))
        D = np.vstack([np.ones_like(phi), phi]).T
        try:
            sol_ca = np.linalg.lstsq(D, yt, rcond=None)[0]
        except lstsq_errors:
            pass  # keep the percentile-based c0 / A0
        else:
            c0 = max(float(sol_ca[0]), 0.0)
            A0 = max(float(sol_ca[1]), 1e-6)

        c_alt = max(ymin - 0.05, 0.0)
        starts = [
            (c0, A0, b0, n00),
            (c_alt, max(ymax - c_alt, 1e-3), 1.0, n00),
        ]

        # Huber threshold: half the MAD-based scale estimate, at least 0.02.
        md = 1.4826 * np.median(np.abs(yt - np.median(yt)))
        d = max(0.02, 0.5 * md)

        def obj_grad(u):
            """Return the mean Huber loss and its gradient w.r.t. the log-parameters."""
            c, A, b, n0 = unpack(u)
            ln0 = np.log(max(n0, 1e-12))
            z = np.clip(b * (ln - ln0), -50.0, 50.0)
            s = np.exp(z)
            den = 1.0 + s
            r = c + A / den - yt
            w = dhuber(r, d) / max(1, len(r))
            dl = ln - ln0
            g_A = 1.0 / den
            g_b = -A * s * dl / (den * den)
            g_n0 = A * s * b / (den * den * max(n0, 1e-12))
            val = float(np.mean(huber(r, d)))
            # Chain rule for theta = exp(u): d/du = theta * d/dtheta.
            g = np.array([np.sum(w) * c,
                          np.sum(w * g_A) * A,
                          np.sum(w * g_b) * b,
                          np.sum(w * g_n0) * n0], dtype=float)
            return val, g

        # Start from whichever candidate already has the lower objective.
        best_u, best_v = pack(starts[0]), np.inf
        for p0 in starts:
            u0 = pack(p0)
            v0, _ = obj_grad(u0)
            if v0 < best_v:
                best_v, best_u = v0, u0

        # jac=True: obj_grad supplies value and gradient in a single call.
        res = minimize(obj_grad, best_u, jac=True, method="L-BFGS-B",
                       options={"maxiter": 400, "ftol": 1e-9})
        # Keep the optimiser's iterate whenever it is finite and no worse than
        # the start, even if the solver did not report formal convergence.
        improved = (np.all(np.isfinite(res.x)) and np.isfinite(res.fun)
                    and res.fun <= best_v)
        u = res.x if improved else best_u
        out.append(list(unpack(u)))

    P = np.asarray(out)
    return P[0] if P.shape[0] == 1 else P
# EVOLVE-BLOCK-END

#2 Run 4 R² = 0.999255

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares

# L(N) = c + a / (1 + exp(p*(log(N) - log(b)))), params: [a,b,p,c] >= 0
def scaling_law_func(data_points, params):
    """Evaluate L(N) = c + a / (1 + exp(p * (log N - log b))).

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``N``.  Values below 1 are raised to 1 before the log.
        params: ``[a, b, p, c]`` for one target, or a 2-D array with one such
            row per target.  Entries are floored at 0 (``a``, ``c``), 1e-12
            (``b``) and 1e-8 (``p``) before use.

    Returns:
        A 1-D array of predictions for a single parameter row, otherwise a
        2-D array with one column per parameter row.
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0].astype(float)
    theta = np.asarray(params, dtype=float)
    if theta.ndim == 1:
        theta = theta[np.newaxis, :]

    # Lower bounds applied column by column, in [a, b, p, c] order.
    lower_bounds = (0.0, 1e-12, 1e-8, 0.0)
    amplitude, midpoint, steepness, floor = (
        np.maximum(theta[:, col], bound) for col, bound in enumerate(lower_bounds)
    )

    log_n = np.log(np.maximum(sizes, 1.0))[:, np.newaxis]
    arg = steepness[np.newaxis, :] * (log_n - np.log(midpoint)[np.newaxis, :])
    # Clip the exponent to [-60, 60] before exponentiation.
    gate = 1.0 / (1.0 + np.exp(np.clip(arg, -60.0, 60.0)))
    curve = floor[np.newaxis, :] + amplitude[np.newaxis, :] * gate
    if curve.shape[1] == 1:
        return curve[:, 0]
    return curve

def fit_scaling_law(data_points, loss_values):
    """Fit ``[a, b, p, c]`` of L(N) = c + a / (1 + exp(p * (log N - log b))).

    Each loss column is fitted independently with ``least_squares``
    (``soft_l1`` loss) from four starting points.  The optimiser works on an
    unconstrained vector ``u``: ``a``, ``p`` and ``c`` are softplus(u) and
    ``b`` is exp(u), so the fitted values stay positive.  Residuals are
    weighted by ``sqrt(log1p(N) / mean(log1p(N)))``.

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``N``.
        loss_values: 1-D array of losses, or a 2-D array with one column per
            target.

    Returns:
        ``[a, b, p, c]`` as a 1-D array for a single target, otherwise a 2-D
        array with one such row per target.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    Y = np.asarray(loss_values)
    if Y.ndim == 1:
        Y = Y[:, None]
    n_targets = Y.shape[1]

    log_x = np.log(np.maximum(x, 1.0))
    geo_mean = float(np.exp(np.mean(log_x)))
    log1p_x = np.log1p(x)
    weights = np.sqrt(log1p_x / np.maximum(np.mean(log1p_x), 1e-12))

    def softplus(v):
        return np.log1p(np.exp(v))

    def inv_softplus(value):
        return np.log(np.expm1(np.maximum(float(value), 1e-12)))

    def sigmoid(v):
        # Derivative of softplus.
        return 1.0 / (1.0 + np.exp(-v))

    fitted = np.zeros((n_targets, 4))
    # Indices of the largest ~30% of x, used to seed the asymptote c.
    tail = np.argsort(x)[max(0, int(0.7 * len(x))):]

    for t in range(n_targets):
        y = Y[:, t]

        # --- heuristic starting values -------------------------------------
        if tail.size:
            c0 = float(np.median(y[tail]))
        else:
            c0 = float(np.min(y))
        c0 = max(0.0, c0)
        a0 = max(1e-3, float(np.max(y) - c0))
        eps = 1e-8
        above = y > c0 + eps
        if np.count_nonzero(above) >= 2:
            # Slope of log(y - c0) against log(x) gives a first guess for p.
            log_excess = np.log(np.maximum(y[above] - c0, eps))
            cov_xy = np.cov(log_x[above], log_excess, bias=True)[0, 1]
            slope = float(cov_xy / np.maximum(np.var(log_x[above]), 1e-12))
            p0 = float(np.clip(-slope, 0.05, 4.0))
        else:
            p0 = 0.25
        # b is seeded at the x whose loss is closest to the half-way level.
        half_level = c0 + 0.5 * a0
        if np.isfinite(half_level):
            nearest = int(np.argmin(np.abs(y - half_level)))
            b0 = float(np.clip(x[nearest], 1.0, np.max(x)))
        else:
            b0 = geo_mean

        candidates = [
            (a0, b0, p0, c0),
            (a0 * 0.7, b0 * 0.5, max(0.05, p0 * 0.8), c0),
            (a0 * 1.3, b0 * 2.0, p0 * 1.2, c0),
            (a0, b0 * 4.0, max(0.1, p0 * 0.6), c0 * 0.9),
        ]

        def residuals(u):
            amp, mid, steep, floor = softplus(u[0]), np.exp(u[1]), softplus(u[2]), softplus(u[3])
            return weights * (scaling_law_func(X, [amp, mid, steep, floor]) - y)

        def jacobian(u):
            # Analytic derivative of the weighted residuals w.r.t. u.
            amp = softplus(u[0])
            steep = softplus(u[2])
            log_mid = u[1]
            arg = steep * (log_x - log_mid)
            gate = 1.0 / (1.0 + np.exp(np.clip(arg, -60.0, 60.0)))
            d_gate = -gate * (1.0 - gate)
            J = np.empty((x.size, 4))
            J[:, 0] = weights * sigmoid(u[0]) * gate
            J[:, 1] = weights * (amp * steep * gate * (1.0 - gate))
            J[:, 2] = weights * (amp * d_gate * sigmoid(u[2]) * (log_x - log_mid))
            J[:, 3] = weights * sigmoid(u[3])
            return J

        # --- multi-start robust least squares --------------------------------
        winner, winner_cost = None, np.inf
        for a_init, b_init, p_init, c_init in candidates:
            u_start = np.array([
                inv_softplus(a_init),
                np.log(b_init),
                inv_softplus(p_init),
                inv_softplus(c_init),
            ])
            res = least_squares(residuals, u_start, jac=jacobian, loss='soft_l1',
                                f_scale=0.5, max_nfev=2000)
            cost = float(np.mean(res.fun ** 2))
            if cost < winner_cost and np.all(np.isfinite(res.x)):
                winner_cost, winner = cost, res.x

        if winner is None:
            # No run produced a finite solution: fall back to the first start.
            a_fit, b_fit, p_fit, c_fit = candidates[0]
        else:
            a_fit = softplus(winner[0])
            b_fit = np.exp(winner[1])
            p_fit = softplus(winner[2])
            c_fit = softplus(winner[3])
        fitted[t] = [a_fit, b_fit, p_fit, c_fit]

    if n_targets == 1:
        return fitted[0]
    return fitted
# EVOLVE-BLOCK-END

#3 Run 2 R² = 0.999208

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate L(n) = L + A / (1 + (n / n1)**a).

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``n``.
        params: ``[L, A, a, n1]`` for one target, or a 2-D array with one such
            row per target.  Rows shorter than four entries are right-padded
            with zeros.

    Returns:
        A 1-D array of predictions for a single parameter row, otherwise a
        2-D array with one column per parameter row.
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0].astype(float)
    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[np.newaxis, :]
    missing = 4 - theta.shape[1]
    if missing > 0:
        theta = np.pad(theta, ((0, 0), (0, missing)), 'constant')

    floor = theta[:, 0]
    amplitude = theta[:, 1]
    exponent = theta[:, 2]
    # Floor n and n1 at 1e-12 so the ratio is well defined.
    scale = np.maximum(theta[:, 3], 1e-12)
    ratio = np.maximum(sizes, 1e-12)[:, np.newaxis] / scale[np.newaxis, :]
    powered = np.power(ratio, np.maximum(exponent[np.newaxis, :], 1e-8))
    curve = floor[np.newaxis, :] + amplitude[np.newaxis, :] / (1.0 + powered)
    if curve.shape[1] == 1:
        return curve[:, 0]
    return curve

def fit_scaling_law(data_points, loss_values):
    """Fit ``[L, A, a, n1]`` of L(n) = L + A / (1 + (n / n1)**a) per loss column.

    The optimiser (L-BFGS-B, four seeds per target) works on a transformed
    vector: ``L`` is mapped through a logistic onto the interval (0.5, 6.0)
    and ``A``, ``a``, ``n1`` are optimised as logarithms.  The objective is a
    Huber loss (threshold 0.25) weighted by normalised ``log1p(n)`` plus a
    small quadratic penalty on the three log-parameters.

    Args:
        data_points: Array-like; after ``np.atleast_2d`` only column 0 is read
            and used as ``n``.
        loss_values: 1-D array of losses, or a 2-D array with one column per
            target.

    Returns:
        ``[L, A, a, n1]`` as a 1-D array for a single target, otherwise a 2-D
        array with one such row per target.
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0].astype(float)
    losses = np.asarray(loss_values)
    if losses.ndim == 1:
        losses = losses[:, None]
    n_points, n_targets = losses.shape

    floor_lo, floor_hi = 0.5, 6.0
    tiny = 1e-12
    weights = np.log1p(sizes)
    weights = weights / (np.sum(weights) + tiny)
    log_sizes = np.log(np.maximum(sizes, 1.0))

    def huber(res, delta=0.25):
        mag = np.abs(res)
        return np.where(mag <= delta, 0.5 * res * res, delta * (mag - 0.5 * delta))

    def encode(L, A, a, n1):
        # Logit of L's position inside (floor_lo, floor_hi); logs for the rest.
        frac = np.clip((L - floor_lo) / (floor_hi - floor_lo), 1e-6, 1 - 1e-6)
        logit = np.log(frac) - np.log1p(-frac)
        return np.array([
            logit,
            np.log(max(A, 1e-12)),
            np.log(max(a, 1e-12)),
            np.log(max(n1, 1e-12)),
        ])

    def decode(theta):
        frac = 1.0 / (1.0 + np.exp(-theta[0]))
        L = floor_lo + (floor_hi - floor_lo) * frac
        return L, np.exp(theta[1]), np.exp(theta[2]), np.exp(theta[3])

    def objective(theta, target):
        L, A, a, n1 = decode(theta)
        fitted = scaling_law_func(sizes[:, None], [L, A, a, n1])
        resid = fitted - target
        return np.sum(weights * huber(resid)) + 1e-6 * (theta[1]**2 + theta[2]**2 + theta[3]**2)

    params = np.zeros((n_targets, 4))
    # Indices of the largest-n points (at least three), used to seed L.
    largest = np.argsort(sizes)[-max(3, n_points // 4):]

    for col in range(n_targets):
        target = losses[:, col]
        L0 = float(np.clip(np.percentile(target[largest], 30), floor_lo, floor_hi))
        A0 = float(np.clip(np.percentile(target, 95) - L0, 1e-3, 10.0))

        # Linearise: log(A / (y - L) - 1) = a * log(n) - a * log(n1).
        excess = np.clip(target - L0, 1e-6, A0 - 1e-6)
        odds = np.clip(A0 / excess - 1.0, 1e-6, 1e6)
        log_odds = np.log(odds)

        try:
            slope, intercept = np.polyfit(log_sizes, log_odds, 1)
            a0 = float(np.clip(slope, 0.05, 2.5))
            n10 = float(np.clip(np.exp(-intercept / max(a0, 1e-8)), 100.0, 1e7))
        except Exception:
            a0, n10 = 0.3, 3e3

        seeds = [
            encode(L0, A0, a0, n10),
            encode(L0 * 0.98, A0 * 1.2, max(0.07, a0 * 0.8), n10 * 0.7),
            encode(L0 * 1.02, A0 * 0.8, min(2.2, a0 * 1.25), n10 * 1.6),
            encode(np.clip(np.min(target), floor_lo, floor_hi), A0 * 0.6, 0.4,
                   np.exp(np.mean(log_sizes))),
        ]

        champion, champion_val = seeds[0], np.inf
        for seed in seeds:
            res = minimize(objective, seed, args=(target,), method='L-BFGS-B',
                           options={'maxiter': 300})
            # A run that did not converge contributes its seed instead.
            candidate = res.x if res.success else seed
            value = objective(candidate, target)
            if value < champion_val:
                champion_val, champion = value, candidate
        params[col] = np.array(decode(champion))

    if n_targets == 1:
        return params[0]
    return params
# EVOLVE-BLOCK-END

#4 Run 3 R² = 0.998872

▼

#5 Run 1 R² = 0.998768

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# 4-parameter saturating power law: L(N)=L_inf + A*(1+N/N0)^(-alpha)
# Parameters: [L_inf, b=lnA, c=lna, d=lnN0] to enforce positivity of A, alpha, N0.
def scaling_law_func(data_points, params):
    """Evaluate L(N) = L_inf + A * (1 + N / N0)**(-alpha).

    Args:
        data_points: Array-like, flattened to 1-D and used as ``N``.
        params: ``[L_inf, ln A, ln alpha, ln N0]`` for one target, or a 2-D
            array with one such row per target.  If a row has only three
            entries, ``N0`` is taken to be 1.

    Returns:
        A 1-D array of predictions for a single parameter row, otherwise a
        2-D array with one column per parameter row.
    """
    sizes = np.asarray(data_points, float).reshape(-1)
    theta = np.asarray(params, float)
    if theta.ndim == 1:
        theta = theta[np.newaxis, :]

    floor = theta[:, 0]
    amplitude = np.exp(theta[:, 1])
    alpha = np.exp(theta[:, 2])
    if theta.shape[1] > 3:
        scale = np.exp(theta[:, 3])
    else:
        scale = np.ones(len(floor))

    # (1 + N/N0)**(-alpha) evaluated as exp(-alpha * log1p(N/N0)).
    log_term = np.log1p(sizes[:, np.newaxis] / scale[np.newaxis, :])
    curve = floor[np.newaxis, :] + amplitude[np.newaxis, :] * np.exp(-alpha[np.newaxis, :] * log_term)
    if curve.shape[1] == 1:
        return curve[:, 0]
    return curve

def fit_scaling_law(data_points, loss_values):
    """Fit L(N) = L_inf + A * (1 + N / N0)**(-alpha) to each loss column.

    For every target, a grid of (L_inf, N0) guesses is turned into full
    starting points by a weighted log-linear regression for ``A`` and
    ``alpha``.  Each starting point is optimised twice with L-BFGS-B (once
    as-is, once with a small jitter) on a weighted Huber loss plus a tiny
    quadratic penalty, and the converged run with the lowest objective wins.

    The jitter is drawn from a generator seeded inside the function, so
    repeated calls with the same data return the same parameters.

    Args:
        data_points: Array-like, flattened to 1-D and used as ``N``.
        loss_values: 1-D array of losses, or a 2-D array with one column per
            target.

    Returns:
        ``[L_inf, ln A, ln alpha, ln N0]`` as a 1-D array for a single target,
        otherwise a 2-D array with one such row per target.
    """
    x = np.asarray(data_points, float).reshape(-1)
    y = np.asarray(loss_values, float)
    Y = y[:, None] if y.ndim == 1 else y
    T = Y.shape[1]
    xpos = np.clip(x, 1.0, None)
    xmin, xmax = float(np.min(xpos)), float(np.max(xpos))
    xg = float(np.exp(np.mean(np.log(xpos))))
    # Mildly up-weight large-N points; weights are normalised to mean 1.
    w = (xpos / xmax) ** 0.25
    w /= np.mean(w)
    w_sum = np.sum(w)

    eps = 1e-8
    # Fixed seed keeps the multi-start jitter reproducible across calls.
    rng = np.random.default_rng(0)
    jitter_scale = np.array([0.0, 0.1, 0.1, 0.1])  # L_inf is never jittered

    def pred_jac(p):
        """Return predictions and their Jacobian w.r.t. [L, lnA, lna, lnN0]."""
        L, b, c, d = p
        A = np.exp(b)
        a = np.exp(c)
        N0 = np.exp(d)
        logz = np.log1p(x / N0)
        t = np.exp(-a * logz)
        yhat = L + A * t
        J = np.vstack([
            np.ones_like(x),              # d/dL
            A * t,                        # d/d lnA
            -A * a * t * logz,            # d/d lna
            A * a * t * (x / (N0 + x)),   # d/d lnN0
        ]).T
        return yhat, J

    def huber(r, d):
        """Return the Huber loss values and their derivative (psi)."""
        a = np.abs(r)
        psi = np.where(a <= d, r, d * np.sign(r))
        val = np.where(a <= d, 0.5 * r * r, d * (a - 0.5 * d))
        return val, psi

    out = []
    for t in range(T):
        yt = Y[:, t]
        ymin, ymax = float(np.min(yt)), float(np.max(yt))
        spread = max(1e-6, ymax - ymin)
        delta = 0.1 * spread + 0.05
        Lc = [max(0.3, ymin - 0.05), max(0.3, ymin - 0.2)]
        N0c = [xg, np.sqrt(xmin * xmax), xmax / 5.0, max(1.0, xmin * 5.0)]

        def obj_grad(p):
            """Weighted Huber objective (plus penalty) and its gradient."""
            yhat, J = pred_jac(p)
            vals, psi = huber(yhat - yt, delta)
            val = np.sum(w * vals) / w_sum + 1e-6 * (p[1] * p[1] + p[2] * p[2] + 0.5 * p[3] * p[3])
            g = (J.T @ (w * psi)) / w_sum
            g[1] += 2e-6 * p[1]
            g[2] += 2e-6 * p[2]
            g[3] += 1e-6 * p[3]
            return float(val), g

        best_p, best_val = None, np.inf
        # Best *initial* guess, returned only if no optimiser run converges.
        fallback_p, fallback_val = None, np.inf

        for L0 in Lc:
            for N0 in N0c:
                # Weighted regression of log(y - L0) on log1p(x / N0) gives
                # lnA (intercept) and -alpha (slope).
                z = np.maximum(yt - L0, eps)
                lx = np.log1p(x / N0)
                ly = np.log(z)
                S0 = w_sum
                Sx = np.sum(w * lx)
                Sy = np.sum(w * ly)
                Sxx = np.sum(w * lx * lx)
                Sxy = np.sum(w * lx * ly)
                den = max(1e-12, S0 * Sxx - Sx * Sx)
                s = (S0 * Sxy - Sx * Sy) / den
                b0 = (Sy * Sxx - Sx * Sxy) / den
                a0 = max(1e-3, -float(s))
                A0 = float(np.exp(b0))
                p0 = np.array([L0, np.log(max(A0, eps)), np.log(a0), np.log(max(N0, 1.0))], float)

                v0 = obj_grad(p0)[0]
                if fallback_p is None or v0 < fallback_val:
                    fallback_p, fallback_val = p0, v0

                for jit in (0.0, 0.1):
                    pj = p0 + jit * jitter_scale * (2.0 * rng.random(4) - 1.0)
                    # jac=True: obj_grad supplies value and gradient together.
                    res = minimize(obj_grad, pj, jac=True, method='L-BFGS-B')
                    if res.success and np.all(np.isfinite(res.x)):
                        fval = float(res.fun)
                        if fval < best_val:
                            best_val, best_p = fval, res.x

        out.append(best_p if best_p is not None else fallback_p)

    out = np.vstack(out)
    return out[0] if out.shape[0] == 1 else out
# EVOLVE-BLOCK-END

SFT Scaling Law

All Runs (sorted by R²)