← Back to Leaderboard

U-shaped Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.935259
Mean R²: 0.927568
Min R²: 0.920570
Runs: 5

All Runs (sorted by R²)

Best Run 5 R² = 0.935259
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

_LN10 = np.log(10.0)

# 6-parameter U-shaped scaling law:
# y(x) = d0 - d1 * exp(-p * ln(10) * x) + A / (1 + ((x - m)/w)^2)
# Captures early worsening (bump > 0 near m) then improvement (monotone baseline).
def scaling_law_func(data_points, params):
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    x = X[:, 0]
    P = np.asarray(params, dtype=float)
    if P.ndim == 1:
        P = P[None, :]
    T, K = P.shape
    if K < 6:
        pad = np.zeros((T, 6), dtype=float); pad[:, :K] = P
        if K <= 2: pad[:, 2] = 0.8
        if K <= 3: pad[:, 3] = 0.0
        if K <= 4: pad[:, 4] = 0.0
        if K <= 5: pad[:, 5] = 0.8
        P = pad
    d0 = P[:, 0][None, :]
    d1 = np.maximum(P[:, 1][None, :], 0.0)
    p  = np.maximum(P[:, 2][None, :], 1e-8)
    A  = P[:, 3][None, :]
    m  = P[:, 4][None, :]
    w  = np.maximum(P[:, 5][None, :], 1e-8)
    xx = x[:, None]
    dec = np.exp(-p * _LN10 * xx)
    u = (xx - m) / w
    bump = 1.0 / (1.0 + u * u)
    y = d0 - d1 * dec + A * bump
    return y[:, 0] if y.shape[1] == 1 else y


def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float)
    x = X[:, 0]
    Y = y[:, None] if y.ndim == 1 else y
    T = Y.shape[1]
    x_min, x_max = float(np.min(x)), float(np.max(x))
    x_rng = max(1e-3, x_max - x_min)

    # Bounds: [d0, d1, p, A, m, w]
    bnds = [
        (-5.0, 0.2),                     # d0
        (0.0, 5.0),                      # d1
        (1e-3, 3.0),                     # p
        (-3.0, 3.0),                     # A
        (x_min - 0.8, x_max + 0.8),      # m
        (0.05, 2.0)                      # w
    ]

    def loss_grad(theta, yy, huber_delta=None):
        d0, d1, p, A, m, w = theta
        p = max(p, 1e-8); w = max(w, 1e-8); d1 = max(d1, 0.0)
        dec = np.exp(-p * _LN10 * x)
        u = (x - m) / w
        D = 1.0 + u * u
        bump = 1.0 / D
        pred = d0 - d1 * dec + A * bump
        r = pred - yy
        N = float(x.size)

        # Partials
        pd0 = np.ones_like(x)
        pd1 = -dec
        pdp = d1 * _LN10 * x * dec
        pdA = bump
        pdm = A * (2.0 * u) / (w * D * D)
        pdw = A * (2.0 * u * u) / (w * D * D)

        if huber_delta is None:
            loss = float(np.mean(r * r)); wr = r; coef = 2.0 / N
        else:
            s = r / huber_delta
            wr = r / np.sqrt(1.0 + s * s)
            loss = float(np.mean(huber_delta * huber_delta * (np.sqrt(1.0 + s * s) - 1.0)))
            coef = 1.0 / N

        g = coef * np.array([
            np.sum(wr * pd0),
            np.sum(wr * pd1),
            np.sum(wr * pdp),
            np.sum(wr * pdA),
            np.sum(wr * pdm),
            np.sum(wr * pdw)
        ], dtype=float)

        # Mild regularization for stability
        lam = 1e-5
        loss += lam * (0.02 * (p * p + A * A) + 0.02 / (w * w))
        g[2] += lam * 0.04 * p
        g[3] += lam * 0.04 * A
        g[5] += lam * (-0.04) / (w ** 3)
        return loss, g

    def baseline_seed(yy):
        best = None; best_mse = np.inf
        for p0 in (0.5, 0.8, 1.2, 0.3):
            z = np.exp(-p0 * _LN10 * x)
            zc = z - np.mean(z)
            yc = yy - np.mean(yy)
            varz = float(np.mean(zc * zc)) or 1e-9
            c = float(np.mean(zc * yc)) / varz
            d1 = max(0.0, -c)
            a = float(np.mean(yy) - c * np.mean(z))
            pred = a - d1 * z
            mse = float(np.mean((pred - yy) ** 2))
            if mse < best_mse:
                best_mse = mse; best = (a, d1, p0, pred)
        return best

    def init_list(yy):
        a, d1, p0, base = baseline_seed(yy)
        resid = yy - base
        s = float(np.std(yy)) or 1e-3
        w0 = np.clip(0.22 * x_rng, bnds[5][0], bnds[5][1])

        # Choose bump center candidates at extreme residuals and mid-range
        idx_pos = int(np.argmax(resid)); idx_neg = int(np.argmin(resid))
        m_pos = np.clip(x[idx_pos], bnds[4][0], bnds[4][1])
        m_neg = np.clip(x[idx_neg], bnds[4][0], bnds[4][1])
        m_mid = np.clip(0.5 * (x_min + x_max), bnds[4][0], bnds[4][1])

        # Amplitude suggestions
        A_pos = np.clip(resid[idx_pos], bnds[3][0], bnds[3][1])
        A_neg = np.clip(-resid[idx_neg], bnds[3][0], bnds[3][1])
        As = [A_pos, -A_pos, A_neg, -A_neg, 0.5 * s, -0.5 * s]

        cands = []
        for m0 in (m_pos, m_neg, m_mid):
            for A0 in As:
                cands.append(np.array([
                    np.clip(a, *bnds[0]),
                    np.clip(d1, *bnds[1]),
                    np.clip(p0, *bnds[2]),
                    np.clip(A0, *bnds[3]),
                    m0,
                    w0
                ], dtype=float))
                cands.append(np.array([
                    np.clip(a - 0.08 * s, *bnds[0]),
                    np.clip(0.85 * d1 + 0.04, *bnds[1]),
                    np.clip(p0 * 0.95, *bnds[2]),
                    np.clip(-A0, *bnds[3]),
                    np.clip(m0 + 0.18 * x_rng, *bnds[4]),
                    np.clip(w0 * 1.15, *bnds[5])
                ], dtype=float))
        rng = np.random.default_rng(2025)
        for _ in range(6):
            cands.append(np.array([
                rng.uniform(*bnds[0]),
                rng.uniform(*bnds[1]),
                rng.uniform(*bnds[2]),
                rng.uniform(*bnds[3]),
                rng.uniform(*bnds[4]),
                rng.uniform(*bnds[5])
            ], dtype=float))
        return cands

    def fit_one(yy):
        yy = yy.ravel()
        best_th, best_val = None, np.inf
        delta_h = max(0.15 * (float(np.std(yy)) or 1e-3), 1e-3)
        for th0 in init_list(yy):
            # Robust stage
            res1 = minimize(lambda th: loss_grad(th, yy, delta_h)[0],
                            th0, method="L-BFGS-B",
                            jac=lambda th: loss_grad(th, yy, delta_h)[1],
                            bounds=bnds, options=dict(maxiter=500, ftol=1e-10))
            th1 = res1.x if res1.success else th0
            # Precise MSE stage
            res2 = minimize(lambda th: loss_grad(th, yy, None)[0],
                            th1, method="L-BFGS-B",
                            jac=lambda th: loss_grad(th, yy, None)[1],
                            bounds=bnds, options=dict(maxiter=500, ftol=1e-10))
            th = res2.x if res2.success else th1
            val = loss_grad(th, yy, None)[0]
            if val < best_val:
                best_val, best_th = val, th
        return best_th

    thetas = np.vstack([fit_one(Y[:, t]) for t in range(T)])
    return thetas[0] if T == 1 else thetas
# EVOLVE-BLOCK-END
#2 Run 1 R² = 0.929570
#3 Run 3 R² = 0.929247
#4 Run 2 R² = 0.923192
#5 Run 4 R² = 0.920570