# EVOLVE-BLOCK-START
"""
U-shaped (double-descent) scaling law with 6 parameters and robust, density-balanced fitting.
Model (6 params):
pred(x) = b0 + A_up * sigmoid((x - c1)/s_l) - A_dn * sigmoid((x - c2)/s_r)
where
A_up = softplus(A_up_raw) >= 0 (early degradation amplitude)
A_dn = softplus(A_dn_raw) >= 0 (later improvement amplitude)
c1 = c0 - 0.5 * d, c2 = c0 + 0.5 * d (ordered centers)
s = softplus(s_raw) + eps (base width > 0)
d = s * (1 + softplus(k_raw)) (separation >= s)
s_l = s * g, s_r = s / g (asymmetry with g = (1 + 0.5*softplus(k_raw))**0.5)
The single k_raw parameter controls both separation and a mild left/right width asymmetry,
capturing skewed U-shapes without increasing parameter count beyond 6.
Fitting improvements:
- Robust Huber loss with inverse-density weights along x (log_flops) to avoid bias from uneven sampling.
- Analytic re-centering of b0 at each evaluation (weighted mean residual given other params) for stability.
- Gentle priors: keep amplitudes/widths moderate, ensure meaningful separation, encourage ultimate improvement.
- Multi-start seeding (grid + jitter), L-BFGS-B refinement, least-squares polish with robust soft_l1 loss,
and a final Powell polish.
"""
import numpy as np
from scipy.optimize import minimize, least_squares


def _softplus(z):
    # Numerically stable softplus: log(1 + exp(z)) computed without overflow for large |z|.
    z = np.asarray(z, dtype=float)
    return np.log1p(np.exp(-np.abs(z))) + np.maximum(z, 0.0)


def _sigmoid(z):
    # Logistic sigmoid; inputs are clipped to keep exp() well-behaved.
    z = np.clip(z, -60.0, 60.0)
    return 1.0 / (1.0 + np.exp(-z))
def scaling_law_func(data_points, params):
    """Evaluate the 6-parameter U-shaped law.

    Accepts a single parameter vector (returns shape (N,)) or a stack of P
    parameter vectors (returns shape (N, P)) via broadcasting.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    x = X[:, 0][:, None]  # (N, 1)
    p = np.asarray(params, dtype=float)
    if p.ndim == 1:
        p = p[None, :]
    # params: [b0, A_up_raw, A_dn_raw, c0, s_raw, k_raw]
    b0 = p[:, 0][None, :]
    A_up = _softplus(p[:, 1])[None, :]
    A_dn = _softplus(p[:, 2])[None, :]
    c0 = p[:, 3][None, :]
    s = _softplus(p[:, 4])[None, :] + 1e-3
    # The single factor k_raw controls both separation and mild asymmetry
    ksp = _softplus(p[:, 5])[None, :]  # >= 0
    d = s * (1.0 + ksp)  # separation >= s
    # Asymmetry factor g = sqrt(1 + 0.5*ksp) >= 1: mild skew flexibility without extra params
    g = np.sqrt(1.0 + 0.5 * ksp)
    s_l = s * g
    s_r = s / np.maximum(g, 1e-6)
    c1 = c0 - 0.5 * d
    c2 = c0 + 0.5 * d
    z1 = (x - c1) / s_l
    z2 = (x - c2) / s_r
    pred = b0 + A_up * _sigmoid(z1) - A_dn * _sigmoid(z2)
    return pred[:, 0] if pred.shape[1] == 1 else pred
def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float)
    N, F = X.shape
    assert F == 1, "Expected single feature: log_flops"
    y2d = y[:, None] if y.ndim == 1 else y
    T = y2d.shape[1]
    x = X[:, 0].astype(float)
    x_min, x_max = float(np.min(x)), float(np.max(x))
    xr = max(float(x_max - x_min), 1e-6)

    # Inverse-density weights over x via quantile binning
    Q = min(20, max(8, N // 25))
    qs = np.linspace(0.0, 1.0, Q + 1)
    edges = np.quantile(x, qs)
    # Nudge the outer edges so the extreme points fall strictly inside the bins
    edges[0] -= 1e-9
    edges[-1] += 1e-9
    bin_idx = np.clip(np.searchsorted(edges, x, side='right') - 1, 0, Q - 1)
    counts = np.maximum(1, np.bincount(bin_idx, minlength=Q))
    w = 1.0 / counts[bin_idx]
    w = (N * w) / np.sum(w)  # normalize weights to sum to N

    def huber(res, delta):
        a = np.abs(res)
        return np.where(a <= delta, 0.5 * res**2, delta * (a - 0.5 * delta))

    def inv_softplus(v):
        v = np.clip(v, 1e-6, 50.0)
        return np.log(np.expm1(v))
    # Objective utilities
    b0_bounds = (-6.0, 0.5)

    def decode_pred_no_b0(pvec):
        # Compute the model prediction with b0 forced to 0 to allow analytic centering of b0
        pv = np.array(pvec, dtype=float)
        pv0 = pv.copy()
        pv0[0] = 0.0
        return scaling_law_func(X, pv0)

    def b0_optimal(pvec, yi):
        # Weighted mean residual (approximately MSE-optimal b0 under the weights)
        f0 = decode_pred_no_b0(pvec)
        num = np.sum(w * (yi - f0))
        den = np.sum(w)
        b0hat = num / max(den, 1e-9)
        return float(np.clip(b0hat, b0_bounds[0], b0_bounds[1]))

    def add_b0(pvec, b0hat):
        pv = np.array(pvec, dtype=float).copy()
        pv[0] = b0hat
        return pv

    def objective(pvec, yi):
        # Analytic b0 centering for stability
        b0hat = b0_optimal(pvec, yi)
        p_used = add_b0(pvec, b0hat)
        pred = scaling_law_func(X, p_used)
        r = pred - yi
        # Robust Huber delta via the MAD of the targets
        mad = np.median(np.abs(yi - np.median(yi)))
        delta = 1.4826 * mad if mad > 1e-8 else 0.1
        loss = np.mean(w * huber(r, delta))
        # Gentle priors/regularization
        A_up = _softplus(pvec[1])
        A_dn = _softplus(pvec[2])
        s = _softplus(pvec[4]) + 1e-3
        ksp = _softplus(pvec[5])
        sep = s * (1.0 + ksp)
        # Encourage meaningful separation and improvement >= degradation
        reg = 3e-4 * (A_up**2 + A_dn**2 + s**2)
        reg += 1.5e-4 * _softplus(A_up - A_dn)    # push A_dn >= A_up
        reg += 2.0e-4 * _softplus(0.3 * s - sep)  # push sep >= 0.3*s
        # Keep transitions inside the observed x-range
        c0 = pvec[3]
        c1 = c0 - 0.5 * sep
        c2 = c0 + 0.5 * sep
        reg += 1.0e-4 * (_softplus(x_min - c1) + _softplus(c2 - x_max))
        # Discourage positive predictions (the brier target should be negative); also penalize a positive asymptote
        tail = b0hat + A_up - A_dn
        reg += 1.0e-5 * (np.mean(np.maximum(pred, 0.0)) + _softplus(tail))
        return loss + reg
    # Residual function for least_squares polishing (soft_l1 robust)
    def residuals(pvec, yi):
        b0hat = b0_optimal(pvec, yi)
        p_used = add_b0(pvec, b0hat)
        pred = scaling_law_func(X, p_used)
        res = np.sqrt(w) * (pred - yi)
        # Append small regularization terms as residuals
        A_up = _softplus(pvec[1])
        A_dn = _softplus(pvec[2])
        s = _softplus(pvec[4]) + 1e-3
        ksp = _softplus(pvec[5])
        sep = s * (1.0 + ksp)
        c0 = pvec[3]
        c1 = c0 - 0.5 * sep
        c2 = c0 + 0.5 * sep
        reg_terms = np.array([
            1e-2 * (A_up - A_dn),                  # encourage A_dn >= A_up
            1e-2 * max(0.3 * s - sep, 0.0),        # ensure separation
            5e-3 * max(x_min - c1, 0.0),
            5e-3 * max(c2 - x_max, 0.0),
            2e-3 * max(b0hat + A_up - A_dn, 0.0),  # positive tail penalty
        ], dtype=float)
        return np.concatenate([res, reg_terms])

    # Parameter bounds
    bnds = [
        b0_bounds,                    # b0
        (-8.0, 8.0),                  # A_up_raw
        (-8.0, 8.0),                  # A_dn_raw
        (x_min - 0.2, x_max + 0.2),   # c0
        (-8.0, 8.0),                  # s_raw
        (-8.0, 8.0),                  # k_raw
    ]
    low = np.array([b[0] for b in bnds], dtype=float)
    high = np.array([b[1] for b in bnds], dtype=float)
    # Smoothing helper to locate the peak of the U-shape
    order = np.argsort(x)
    x_sorted = x[order]

    def smooth(vals, w_frac=0.06):
        k = max(5, int(w_frac * N))
        v = np.asarray(vals, dtype=float)[order]
        pad = np.pad(v, (k // 2, k - 1 - k // 2), mode='edge')
        ker = np.ones(k, dtype=float) / k
        return np.convolve(pad, ker, mode='valid')

    q25, q40, q50, q60, q75 = np.quantile(x, [0.25, 0.40, 0.50, 0.60, 0.75])
    rng = np.random.default_rng(123)
    params_opt = np.zeros((T, 6), dtype=float)
    for ti in range(T):
        yi = y2d[:, ti]
        # Initialization
        sm = smooth(yi, 0.06)
        idx_peak = int(np.clip(np.argmax(sm), 0, N - 1))
        c0_init = float(x_sorted[idx_peak]) if np.isfinite(idx_peak) else float(q50)
        y_low = float(np.mean(yi[x <= q25])) if np.any(x <= q25) else float(np.mean(yi))
        y_high = float(np.mean(yi[x >= q75])) if np.any(x >= q75) else float(np.mean(yi))
        y_peak = float(sm[idx_peak]) if np.isfinite(idx_peak) else float(np.median(yi))
        b0_init = float(np.median(yi[x <= q25])) if np.any(x <= q25) else float(np.median(yi))
        Aup0 = max(0.02, y_peak - y_low)
        Adn0 = max(0.02, y_peak - y_high)
        s0 = max(0.08, 0.20 * xr)
        d0 = max(0.25 * xr, 1.1 * s0)
        base = np.array([
            b0_init,
            inv_softplus(0.8 * Aup0),
            inv_softplus(0.8 * Adn0),
            c0_init,
            inv_softplus(s0),
            inv_softplus(max(1.0, d0 / s0 - 1.0)),
        ], dtype=float)
        # Seed pool: grid over c0, s and amplitude scales + jitter
        seeds = []
        for cg in (c0_init, float(q40), float(q60)):
            for sg in (0.18 * xr, 0.28 * xr, 0.38 * xr):
                for asc in (0.7, 1.0, 1.3):
                    seeds.append(np.array([
                        b0_init,
                        inv_softplus(asc * Aup0),
                        inv_softplus(asc * Adn0),
                        cg,
                        inv_softplus(max(0.05, sg)),
                        inv_softplus(max(1.0, d0 / max(0.05, sg) - 1.0)),
                    ], dtype=float))
        seeds.append(base)
        for _ in range(8):
            jitter = np.array([
                rng.normal(0, 0.05),
                rng.normal(0, 0.25),
                rng.normal(0, 0.25),
                rng.normal(0, 0.15 * xr),
                rng.normal(0, 0.25),
                rng.normal(0, 0.25),
            ], dtype=float)
            seeds.append(base + jitter)
        # Score seeds quickly and keep only the best starts for refinement
        scored = []
        for seed in seeds:
            seed = np.clip(seed, low, high)
            try:
                scored.append(objective(seed, yi))
            except Exception:
                scored.append(np.inf)
        top_idx = np.argsort(scored)[:min(10, len(seeds))]
        best_p, best_val = None, np.inf
        for idx in top_idx:
            init = np.clip(seeds[idx], low, high)
            # L-BFGS-B refinement
            res = minimize(objective, init, args=(yi,), method='L-BFGS-B', bounds=bnds,
                           options={'maxiter': 800, 'ftol': 1e-9})
            cand_p = res.x if res.success else init
            val = objective(cand_p, yi)
            if val < best_val:
                best_val, best_p = val, cand_p
        # Robust least-squares polish
        try:
            ls = least_squares(lambda pv: residuals(pv, yi), best_p, method='trf',
                               bounds=(low, high), loss='soft_l1', f_scale=1.0,
                               max_nfev=800, xtol=1e-9, ftol=1e-9)
            if ls.success:
                best_p = ls.x
        except Exception:
            pass
        # Final Powell polish
        res2 = minimize(objective, best_p, args=(yi,), method='Powell',
                        options={'maxiter': 500, 'ftol': 1e-7})
        if res2.success and res2.fun <= objective(best_p, yi):
            best_p = res2.x
        # Set the final b0 analytically
        b0hat = b0_optimal(best_p, yi)
        best_p = add_b0(best_p, b0hat)
        params_opt[ti] = best_p
    return params_opt[0] if T == 1 else params_opt
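
# Minimal usage sketch (hedged illustration, not part of the fitting pipeline): the grid,
# noise level, and "true" parameter values below are assumptions chosen only to show how
# scaling_law_func and fit_scaling_law are called together on a single log_flops feature.
if __name__ == "__main__":
    rng_demo = np.random.default_rng(0)
    # Synthetic log-FLOPs grid and an assumed ground-truth parameter vector
    # [b0, A_up_raw, A_dn_raw, c0, s_raw, k_raw].
    log_flops = np.sort(rng_demo.uniform(12.0, 22.0, size=200))[:, None]
    true_params = np.array([-1.0, 0.3, 1.2, 17.0, 0.5, 0.8])
    y_clean = scaling_law_func(log_flops, true_params)
    y_obs = y_clean + rng_demo.normal(0.0, 0.02, size=y_clean.shape)
    # Fit and report the in-sample RMSE of the recovered curve.
    fitted = fit_scaling_law(log_flops, y_obs)
    rmse = float(np.sqrt(np.mean((scaling_law_func(log_flops, fitted) - y_obs) ** 2)))
    print("fitted params:", np.round(fitted, 3))
    print("synthetic-data RMSE: %.4f" % rmse)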
# EVOLVE-BLOCK-END