SLD - Data-Constrained Scaling Law

All Runs (sorted by R²)

Best Run 4 R² = 0.926947

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# Reference scales used by _normalize to make the raw inputs dimensionless:
# _P0 divides the parameter count, _D0 the token count and _U0 the
# unique-token count.
_P0, _D0, _U0 = 1.1e9, 1.0e12, 5.0e8
# Lower clip applied to every normalized input so that its logarithm is finite.
_EPS = 1e-12

def _normalize(X):
    """Return (U, P, D): columns 0, 1, 2 of X scaled by _U0, _P0, _D0.

    Each scaled column is floored at _EPS so later logarithms stay finite.
    """
    references = (_U0, _P0, _D0)
    scaled = (X[:, col] / ref for col, ref in enumerate(references))
    U, P, D = (np.clip(values, _EPS, None) for values in scaled)
    return U, P, D

def scaling_law_func(data_points, params):
    """Evaluate the additive power-law loss model.

    Prediction: L0 + cP * P^(-aP) + cD * D^(-aD) + cU * U^(-aU), where U, P
    and D are the normalized inputs returned by _normalize and negative
    exponents are clamped to zero.

    data_points: array-like with 3 columns [unique_tokens, params, tokens].
    params: 7 values [L0, cP, aP, cD, aD, cU, aU], or a 2-D array holding one
        such row per parameter set.
    Returns a 1-D array for a single parameter set, otherwise a 2-D array with
    one column per parameter set.
    Raises ValueError when either argument has the wrong number of columns.
    """
    points = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    if points.shape[1] != 3:
        raise ValueError("data_points must have 3 columns: [unique_tokens, params, tokens]")
    uniq, size, tokens = _normalize(points)

    theta = np.atleast_2d(np.asarray(params, dtype=np.float64))
    if theta.shape[1] != 7:
        raise ValueError("params must have 7 elements: [L0,cP,aP,cD,aD,cU,aU]")

    # One 1-D array per parameter, each holding one entry per parameter set.
    floor, amp_p, exp_p, amp_d, exp_d, amp_u, exp_u = theta.T
    exp_p, exp_d, exp_u = (np.clip(e, 0.0, None) for e in (exp_p, exp_d, exp_u))

    log_p = np.log(size)[:, None]
    log_d = np.log(tokens)[:, None]
    log_u = np.log(uniq)[:, None]

    # Powers are evaluated as exp(-exponent * log(x)).
    term_p = amp_p[None, :] * np.exp(-exp_p[None, :] * log_p)
    term_d = amp_d[None, :] * np.exp(-exp_d[None, :] * log_d)
    term_u = amp_u[None, :] * np.exp(-exp_u[None, :] * log_u)
    pred = floor[None, :] + term_p + term_d + term_u

    if theta.shape[0] == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """Fit [L0, cP, aP, cD, aD, cU, aU] of the additive power-law model.

    For every target column the fit runs in two stages:
      1. A multi-start L-BFGS-B search over phi = [L0, aP, aD, aU]. For each
         candidate phi the amplitudes (cP, cD, cU) come from a small
         ridge-regularized linear solve rather than being optimized directly.
      2. A joint L-BFGS-B refinement of all seven parameters, started from the
         best stage-1 solution.
    Both stages minimize the mean pseudo-Huber loss of the residuals plus a
    small L2 penalty on amplitudes and exponents.

    data_points: array-like with 3 columns [unique_tokens, params, tokens].
    loss_values: 1-D array of losses, or 2-D with one column per target.
    Returns a length-7 vector for a single target, otherwise an array of shape
    (T, 7) with one row per target column.
    Raises ValueError if data_points does not have 3 columns.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    if X.shape[1] != 3:
        raise ValueError("data_points must have 3 columns: [unique_tokens, params, tokens]")
    # Treat a 1-D loss vector as a single target column.
    y2d = y[:, None] if y.ndim == 1 else y
    U, P, D = _normalize(X)
    # Logs of the normalized inputs are reused by every objective evaluation.
    lp, ld, lu = np.log(P), np.log(D), np.log(U)

    # Pseudo-Huber loss: close to r**2 / 2 for |r| << d, close to d * |r| for |r| >> d.
    def pseudo_huber(r, d=0.25):
        return d*d * (np.sqrt(1.0 + (r/d)**2) - 1.0)

    # Solve for the three amplitudes given L0 and the exponents: ridge-regularized
    # normal equations (penalty 1e-3), then clip the result to [1e-12, 100].
    def ridge_amplitudes(y_col, L0, aP, aD, aU):
        Phi = np.stack([np.exp(-aP * lp), np.exp(-aD * ld), np.exp(-aU * lu)], axis=1)
        b = y_col - L0
        AtA = Phi.T @ Phi + 1e-3 * np.eye(3)
        Atb = Phi.T @ b
        c = np.linalg.solve(AtA, Atb)
        return np.clip(c, 1e-12, 100.0)

    # Stage-1 objective over phi = [L0, aP, aD, aU]; amplitudes are recomputed
    # from ridge_amplitudes on every call.
    def obj_phi(phi, y_col):
        L0, aP, aD, aU = phi
        aP = max(aP, 0.0); aD = max(aD, 0.0); aU = max(aU, 0.0)
        cP, cD, cU = ridge_amplitudes(y_col, L0, aP, aD, aU)
        pred = (L0
                + cP * np.exp(-aP * lp)
                + cD * np.exp(-aD * ld)
                + cU * np.exp(-aU * lu))
        r = pred - y_col
        reg = 1e-6 * (cP*cP + cD*cD + cU*cU + aP*aP + aD*aD + aU*aU)
        return np.mean(pseudo_huber(r)) + reg

    # Build the stage-1 starting points: 3 L0 guesses around min(y) times 4 fixed
    # exponent triples (12 starts), plus 6 random starts drawn with a fixed seed
    # so the fit is reproducible.
    def make_inits(y_col):
        y_min = float(np.min(y_col))
        inits = []
        for L0 in [max(y_min - 0.1, 0.0), y_min, min(y_min + 0.2, 10.0)]:
            for aP, aD, aU in [(0.5,0.5,0.5),(0.8,0.6,0.4),(0.3,0.9,0.4),(1.0,0.4,0.3)]:
                inits.append(np.array([L0, aP, aD, aU], dtype=np.float64))
        rng = np.random.default_rng(123)
        for _ in range(6):
            inits.append(np.array([
                float(np.clip(y_min + 0.2 * rng.uniform(-1, 1), 0.0, 10.0)),
                rng.uniform(0.05, 1.5),
                rng.uniform(0.05, 1.5),
                rng.uniform(0.05, 1.5)
            ], dtype=np.float64))
        return inits

    # Box constraints for phi = [L0, aP, aD, aU].
    bounds_phi = [(0.0, 10.0), (0.02, 2.5), (0.02, 2.5), (0.02, 2.5)]
    T = y2d.shape[1]
    params_all = np.zeros((T, 7), dtype=np.float64)

    for t in range(T):
        y_col = y2d[:, t]
        best_val, best_phi = np.inf, None
        for init in make_inits(y_col):
            try:
                res = minimize(obj_phi, init, args=(y_col,), method="L-BFGS-B",
                               bounds=bounds_phi, options=dict(maxiter=500, ftol=1e-9))
                # NOTE(review): when the optimizer does not report success the
                # iterate res.x is discarded in favour of the starting point,
                # even if res.x has a lower objective value.
                phi = res.x if res.success else init
                val = obj_phi(phi, y_col)
            except Exception:
                # Best effort: fall back to scoring the unoptimized start.
                phi, val = init, obj_phi(init, y_col)
            if val < best_val:
                best_val, best_phi = val, phi
        # NOTE(review): best_phi stays None if no start produced a value below
        # +inf (e.g. all objectives NaN); the unpacking below would then fail.
        L0, aP, aD, aU = best_phi
        cP, cD, cU = ridge_amplitudes(y_col, L0, aP, aD, aU)
        # Parameter order expected by scaling_law_func.
        theta = np.array([L0, cP, aP, cD, aD, cU, aU], dtype=np.float64)

        # Stage-2 objective over all seven parameters; values are clipped to the
        # same ranges as the optimizer bounds before evaluation.
        def obj_full(th):
            L0, cP, aP, cD, aD, cU, aU = th
            aP = np.clip(aP, bounds_phi[1][0], bounds_phi[1][1])
            aD = np.clip(aD, bounds_phi[2][0], bounds_phi[2][1])
            aU = np.clip(aU, bounds_phi[3][0], bounds_phi[3][1])
            cP = np.clip(cP, 1e-12, 100.0); cD = np.clip(cD, 1e-12, 100.0); cU = np.clip(cU, 1e-12, 100.0)
            L0 = np.clip(L0, 0.0, 10.0)
            pred = (L0
                    + cP * np.exp(-aP * lp)
                    + cD * np.exp(-aD * ld)
                    + cU * np.exp(-aU * lu))
            r = pred - y_col
            reg = 1e-6 * (cP*cP + cD*cD + cU*cU + aP*aP + aD*aD + aU*aU)
            return np.mean(pseudo_huber(r)) + reg

        # Bounds in theta order: [L0, cP, aP, cD, aD, cU, aU].
        b_full = [(0.0, 10.0), (1e-12, 100.0), bounds_phi[1], (1e-12, 100.0),
                  bounds_phi[2], (1e-12, 100.0), bounds_phi[3]]
        try:
            res2 = minimize(obj_full, theta, method="L-BFGS-B",
                            bounds=b_full, options=dict(maxiter=300, ftol=1e-9))
            theta = res2.x if res2.success else theta
        except Exception:
            # Best effort: keep the stage-1 solution if the refinement raises.
            # NOTE(review): the exception is swallowed without any logging.
            pass
        params_all[t, :] = theta

    return params_all[0] if T == 1 else params_all
# EVOLVE-BLOCK-END

#2 Run 5 R² = 0.925137

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares

"""
Improved scaling law under data-constrained regimes.

Loss(P, D, U) = L0 + a * P^{-alpha} * f_ratio + b * Deff^{-beta}
Deff = U * (1 - exp(-D / (c * U)))
f_ratio = r / (1 + r),  with r = D / (k * P)

Params: [L0, a, alpha, b, beta, c, k]  (<=7)
- L0: irreducible loss floor (bounded to [1,8])
- a,b: amplitudes (>=0)
- alpha,beta: exponents (>0, bounded to [0.05, 1.2])
- c: saturation scale for coupon-collector Deff (>0)
- k: data-per-parameter ratio scale (>0), controls data-limited attenuation

Notes:
- Coupon-collector Deff captures dedup saturation via unique tokens U.
- The ratio factor f_ratio attenuates capacity gains when D << k*P, improving
  modeling in data-limited regimes without adding a separate interaction term.
- Log-space exponentiation ensures numerical stability across large ranges.
"""

def scaling_law_func(data_points, params):
    """Predict loss from unique tokens U, model size P and training tokens D.

    Model:
        loss = L0 + a * P^(-alpha) * r / (1 + r) + b * Deff^(-beta)
        r    = D / (k * P)
        Deff = U * (1 - exp(-D / (c * U)))

    Args:
        data_points: array-like of shape (N, 3) with columns [U, P, D]; a
            single row is promoted to 2-D.
        params: [L0, a, alpha, b, beta, c, k], either one vector or a 2-D
            array with one row per parameter set. Missing trailing entries
            fall back to the defaults 3.0, 1, 0.25, 1, 0.25, 1, 100.

    Returns:
        Array of shape (N,) for a single parameter set, otherwise (N, T) with
        one column per parameter set.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    U, Pm, D = X[:, 0], X[:, 1], X[:, 2]

    theta = np.asarray(params, dtype=float)
    if theta.ndim == 1:
        theta = theta[None, :]
    T, K = theta.shape

    # Defaults if fewer than 7 provided: [L0, a, alpha, b, beta, c, k]
    defaults = (3.0, 1.0, 0.25, 1.0, 0.25, 1.0, 100.0)
    L0, a, alpha, b, beta, c, k = (
        theta[:, j] if K > j else np.full(T, default)
        for j, default in enumerate(defaults)
    )

    eps = 1e-12
    P_ = np.maximum(Pm, eps)[:, None]
    U_ = np.maximum(U, eps)[:, None]
    D_ = np.maximum(D, eps)[:, None]
    logP = np.log(P_)

    # Effective data with unique-token saturation (coupon collector).
    # -expm1(-x) equals 1 - exp(-x) but keeps full precision when x is tiny;
    # the naive form rounds to 0 there, which the eps clamp below would turn
    # into an enormous Deff^(-beta).
    c_ = np.maximum(c, eps)[None, :]
    Deff = U_ * (-np.expm1(-D_ / (c_ * U_)))
    logDeff = np.log(np.maximum(Deff, eps))

    # Data-to-model ratio attenuation
    k_ = np.maximum(k, eps)[None, :]
    r = D_ / (k_ * P_)
    f_ratio = r / (1.0 + r)

    # Powers are evaluated in log space: x^(-e) = exp(-e * log(x)).
    p_pow = np.exp(-alpha[None, :] * logP)      # P^{-alpha}
    d_pow = np.exp(-beta[None, :] * logDeff)    # Deff^{-beta}

    pred = L0[None, :] + a[None, :] * (p_pow * f_ratio) + b[None, :] * d_pow
    return pred[:, 0] if pred.shape[1] == 1 else pred


def fit_scaling_law(data_points, loss_values):
    """Fit [L0, a, alpha, b, beta, c, k] with multi-start robust least squares.

    The physical parameters are reparameterized through unconstrained
    variables u (sigmoid for bounded values, softplus for amplitudes, exp for
    c and k), and scipy's least_squares is run from 10 perturbed copies of a
    data-driven starting point. All target columns are fitted jointly in one
    flattened parameter vector.

    data_points: array-like with columns [U, P, D], passed to scaling_law_func.
    loss_values: 1-D array of losses, or 2-D with one column per target.
    Returns a length-7 vector for a single target, otherwise an array of shape
    (T, 7).
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    # Treat a 1-D loss vector as a single target column.
    y2d = y[:, None] if y.ndim == 1 else y
    T = y2d.shape[1]

    # Stable transforms
    def sp(z):  # softplus
        return np.log1p(np.exp(-np.abs(z))) + np.maximum(z, 0)
    def inv_sp(v):  # inverse softplus; input floored at 1e-12
        v = np.maximum(v, 1e-12)
        return np.log(np.expm1(v))
    def sig(z):  # logistic sigmoid
        return 1.0 / (1.0 + np.exp(-z))
    def logit(p):  # inverse sigmoid; input clipped away from 0 and 1
        p = np.clip(p, 1e-9, 1 - 1e-9)
        return np.log(p / (1 - p))

    # Map unconstrained u -> physical theta
    def u_to_theta(u):
        u = u.reshape(T, 7)
        L0    = 1.0 + 7.0 * sig(u[:, 0])            # [1,8]
        a     = sp(u[:, 1])                         # >=0
        alpha = 0.05 + 1.15 * sig(u[:, 2])          # [0.05,1.2]
        b     = sp(u[:, 3])                         # >=0
        beta  = 0.05 + 1.15 * sig(u[:, 4])          # [0.05,1.2]
        c     = np.exp(u[:, 5])                     # >0
        k     = np.exp(u[:, 6])                     # >0
        return np.stack([L0, a, alpha, b, beta, c, k], axis=1)

    # Residuals with robust scaling and modest regularization
    # Each target's residuals are divided by that target's standard deviation.
    y_scale = np.std(y2d, axis=0) + 1e-8
    lam_u = 1e-5    # weight of the generic penalty on the unconstrained u
    lam_c = 1e-6    # weight pulling log(c) towards 0
    lam_k = 1e-6    # weight pulling log(k) towards log(100)
    lam_ab = 1e-6   # weight pulling alpha towards 0.3 and beta towards 0.25

    # Residual vector for least_squares: scaled prediction errors followed by
    # the regularization terms expressed as extra pseudo-residuals.
    def residuals(u_flat):
        theta = u_to_theta(u_flat)
        pred = scaling_law_func(X, theta)
        r = (pred - (y2d if T > 1 else y2d[:, 0])) / (y_scale if T > 1 else y_scale[0])
        # Regularization via pseudo-residuals
        u = u_flat.reshape(T, 7)
        # NOTE(review): this repeats the u_to_theta call made above; phys and
        # theta hold the same values.
        phys = u_to_theta(u_flat)
        c_phys = phys[:, 5]
        k_phys = phys[:, 6]
        alpha_phys = phys[:, 2]
        beta_phys  = phys[:, 4]
        reg_generic = np.sqrt(lam_u) * u.ravel()
        reg_c = np.sqrt(lam_c) * np.log(c_phys + 1e-12)
        reg_k = np.sqrt(lam_k) * (np.log(k_phys + 1e-12) - np.log(100.0))
        reg_ab = np.sqrt(lam_ab) * np.concatenate([alpha_phys - 0.3, beta_phys - 0.25])
        return np.concatenate([r.ravel(), reg_generic, reg_c, reg_k, reg_ab])

    # Initialization
    # L0 starts 20% of the loss span above the minimum (clipped to [1, 8]);
    # both amplitudes start at 40% of the span.
    y_min = np.min(y2d, axis=0)
    y_max = np.max(y2d, axis=0)
    span = np.maximum(0.1, y_max - y_min)
    L0_init = np.clip(y_min + 0.2 * span, 1.0, 8.0)
    a0, b0 = 0.4 * span, 0.4 * span
    alpha0, beta0 = 0.3, 0.25
    c0, k0 = 1.0, 100.0
    base_theta = np.stack([L0_init,
                           a0,
                           np.full(T, alpha0),
                           b0,
                           np.full(T, beta0),
                           np.full(T, c0),
                           np.full(T, k0)], axis=1)
    # Invert u_to_theta column by column to obtain the unconstrained start.
    u0 = np.empty((T, 7))
    u0[:, 0] = logit((base_theta[:, 0] - 1.0) / 7.0)
    u0[:, 1] = inv_sp(base_theta[:, 1])
    u0[:, 2] = logit((base_theta[:, 2] - 0.05) / 1.15)
    u0[:, 3] = inv_sp(base_theta[:, 3])
    u0[:, 4] = logit((base_theta[:, 4] - 0.05) / 1.15)
    u0[:, 5] = np.log(base_theta[:, 5])
    u0[:, 6] = np.log(base_theta[:, 6])

    # Multi-start robust least-squares
    # Fixed seed keeps the perturbed starts, and hence the fit, reproducible.
    rng = np.random.default_rng(123)
    best_cost, best_u = np.inf, u0.ravel()
    for _ in range(10):
        u_init = (u0 + rng.normal(0, 0.25, size=u0.shape)).ravel()
        res = least_squares(residuals, u_init, method='trf', loss='soft_l1', f_scale=0.5, max_nfev=2000)
        # NOTE(review): starts are ranked by the plain mean squared residual
        # (regularization pseudo-residuals included), not by the soft_l1 cost
        # the optimizer minimized — confirm this is intended.
        cost = np.mean(res.fun**2)
        if cost < best_cost:
            best_cost, best_u = cost, res.x

    theta_opt = u_to_theta(best_u)
    return theta_opt[0] if T == 1 else theta_opt
# EVOLVE-BLOCK-END

#3 Run 2 R² = 0.906964

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# Dimensionless anchors for stability within given ranges:
# _U0 divides unique tokens, _P0 the parameter count and _D0 the token count.
_U0, _P0, _D0 = 1e8, 3e8, 1e11
# Small offset added to the normalized inputs; also the lower clamp for the
# strictly positive model parameters in scaling_law_func.
_EPS = 1e-12

def scaling_law_func(data_points, params):
    """Predict loss from [unique_tokens, params, tokens] rows.

    With u, m, d the inputs divided by _U0, _P0, _D0 (plus _EPS):
        E     = d / (1 + c * (d / u)^g)
        alpha = rho * beta
        loss  = L0 + a * m^(-alpha) + b * E^(-beta)
                + sqrt(a * b) * rho / (1 + rho) * m^(-alpha) * E^(-beta)

    params holds [L0, a, b, c, beta, g, rho], as one vector or one row per
    parameter set. Rows shorter than 7 are padded with 1.0 and extra columns
    are ignored. L0 is clamped to >= 0 and the remaining values to >= _EPS.
    Returns shape (N,) for a single parameter set, otherwise (N, T).
    """
    points = np.atleast_2d(np.asarray(data_points, dtype=float))
    u = points[:, 0] / _U0 + _EPS
    m = points[:, 1] / _P0 + _EPS
    d = points[:, 2] / _D0 + _EPS

    theta = np.asarray(params, dtype=float)
    if theta.ndim == 1:
        theta = theta[None, :]
    n_missing = 7 - theta.shape[1]
    if n_missing > 0:
        theta = np.pad(theta, ((0, 0), (0, n_missing)), constant_values=1.0)
    theta = theta[:, :7]

    # Keep every parameter inside its physical domain.
    floor = np.maximum(theta[:, 0], 0.0)[None, :]
    a, b, c, beta, g, rho = (
        np.maximum(theta[:, j], _EPS)[None, :] for j in range(1, 7)
    )

    d_col = d[:, None]
    ratio = (d / u)[:, None]
    # Effective data: d discounted by a power of the tokens-to-unique ratio.
    effective = d_col / (1.0 + c * np.exp(g * np.log(ratio + _EPS)))

    alpha = rho * beta
    phi_p = np.exp(-alpha * np.log(m[:, None]))
    phi_e = np.exp(-beta * np.log(effective))
    cross_amp = np.sqrt(a * b) * (rho / (1.0 + rho))

    pred = floor + a * phi_p + b * phi_e + cross_amp * (phi_p * phi_e)
    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """Fit [L0, a, b, c, beta, g, rho] by multi-start L-BFGS-B.

    A heuristic initial guess is built per target column, perturbed randomly
    for each start, mapped through an inverse softplus and optimized without
    bounds; softplus of the optimizer variables gives the returned parameters.
    The objective is a Huber data loss plus L2, monotonicity, exponent-prior
    and prediction-range penalties. All target columns are optimized jointly.

    data_points: array-like with columns [unique_tokens, params, tokens].
    loss_values: 1-D array of losses, or 2-D with one column per target.
    Returns a length-7 vector for a single target, otherwise shape (T, 7).
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y = np.asarray(loss_values, dtype=float)
    Y = y[:, None] if y.ndim == 1 else y
    # NOTE(review): N is not used below.
    N, T = Y.shape

    # Same normalization as scaling_law_func.
    U, P, D = X[:, 0], X[:, 1], X[:, 2]
    u = U / _U0 + _EPS
    m = P / _P0 + _EPS
    d = D / _D0 + _EPS
    r = d / u

    # Dispersion-informed seeds for repetition penalty and coupling
    sr = float(np.std(np.log(r + _EPS)))
    c0 = np.clip(0.12 * np.exp(0.25 * sr), 0.05, 0.35)
    g0 = np.clip(0.60 + 0.20 * np.tanh(sr), 0.45, 0.85)
    beta0, rho0 = 0.28, 1.25  # target alpha ≈ 0.35

    # Per-target initial guess: fix L0 and the shape parameters, then solve an
    # ordinary least-squares problem for the three amplitudes.
    init = np.zeros((T, 7), dtype=float)
    for t in range(T):
        yt = Y[:, t]
        L0 = max(0.5, float(np.percentile(yt, 5)))
        E0 = d / (1.0 + c0 * np.power(r, g0))
        phiP = np.exp(-(rho0 * beta0) * np.log(m))
        phiE = np.exp(-beta0 * np.log(E0))
        Phi = np.stack([phiP, phiE, phiP * phiE], axis=1)
        amps, *_ = np.linalg.lstsq(Phi, yt - L0, rcond=None)
        a0 = max(amps[0], 1e-6)
        b0 = max(amps[1], 1e-6)
        s_amp = max(amps[2], 1e-6)
        # The model's cross amplitude is sqrt(a*b) * rho / (1 + rho); recover
        # rho from the fitted cross amplitude by inverting that relation.
        lam = np.clip(s_amp / np.sqrt(max(a0, _EPS) * max(b0, _EPS)), 1e-3, 0.95)
        rho_init = lam / (1.0 - lam)
        init[t] = np.array([L0, a0, b0, c0, beta0, g0, rho_init], dtype=float)

    # Softplus and its inverse: the optimizer works on q with params = _sp(q).
    def _sp(z): return np.log1p(np.exp(-np.abs(z))) + np.maximum(z, 0.0)
    def _inv_sp(v): v = np.maximum(v, 1e-12); return np.log(np.expm1(v))

    # Copies of the inputs with P or D scaled by 1.4, used by the monotonicity
    # penalty in obj.
    Xp = np.stack([U, P * 1.4, D], axis=1)
    Xd = np.stack([U, P, D * 1.4], axis=1)
    # NOTE: lam is rebound here to the L2 weight on q; its earlier use inside
    # the initialization loop is unrelated.
    delta, lam, alpha_prior = 0.55, 1e-4, 0.35

    def obj(q_flat):
        q = q_flat.reshape(T, 7)
        p = _sp(q)
        pred = scaling_law_func(X, p)
        res = pred - (Y if T > 1 else Y[:, 0])
        a = np.abs(res)
        # Huber loss: quadratic within delta, linear beyond it.
        huber = np.where(a <= delta, 0.5 * res**2, delta * (a - 0.5 * delta))
        loss = np.mean(huber)
        # Monotonicity and exponent prior
        # Penalize predictions that rise when P or D is increased.
        mono = 1e-3 * (np.mean(np.maximum(0.0, scaling_law_func(Xp, p) - pred)**2) +
                       np.mean(np.maximum(0.0, scaling_law_func(Xd, p) - pred)**2))
        # Pull alpha = rho * beta towards alpha_prior.
        prior = 5e-4 * np.sum((p[:, 6] * p[:, 4] - alpha_prior)**2)
        # Penalize predictions above 12 or below 0.
        rng_pen = 1e-4 * np.mean((np.maximum(0.0, pred - 12.0))**2 + (np.maximum(0.0, 0.0 - pred))**2)
        return float(loss + lam * np.sum(q**2) + mono + prior + rng_pen)

    # Fixed seed keeps the perturbed starts reproducible.
    rng = np.random.default_rng(42)
    starts = max(6, 2 * T + 4)
    best_val, best_q = np.inf, None
    for _ in range(starts):
        p0 = init.copy()
        # Multiplicative jitter on L0, a, b and c; additive jitter on beta, g, rho.
        p0[:, :3] = np.clip(p0[:, :3] * np.exp(rng.normal(0.0, 0.25, size=(T, 3))), 1e-8, None)
        p0[:, 3]  = np.clip(p0[:, 3]  * np.exp(rng.normal(0.0, 0.20, size=T)), 1e-8, None)
        p0[:, 4:] = np.clip(p0[:, 4:] + rng.normal(0.0, 0.12, size=(T, 3)), 1e-3, None)
        q0 = _inv_sp(p0)
        res = minimize(obj, q0.ravel(), method='L-BFGS-B', options={'maxiter': 600})
        # NOTE(review): res.x is discarded when success is False, even if it
        # improves on the starting point.
        q_opt = res.x if res.success else q0.ravel()
        val = obj(q_opt)
        if val < best_val:
            best_val, best_q = val, q_opt

    # NOTE(review): best_q is still None here if every start evaluated to a
    # non-finite objective; the reshape would then raise.
    p_opt = _sp(best_q.reshape(T, 7))
    return p_opt[0] if T == 1 else p_opt
# EVOLVE-BLOCK-END

#4 Run 3 R² = 0.900716

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# Reference scales: inputs are divided by these before entering the model.
_P_REF = 1e9    # parameter count
_D_REF = 1e11   # training tokens
_U_REF = 1e8    # unique tokens
# Small positive floor that keeps ratios and logarithms finite.
_EPS   = 1e-12

def _effective_tokens(U, D):
    """Return u * (1 - exp(-d / (u + _EPS))) + _EPS for normalized u and d.

    u is U floored at _EPS and divided by _U_REF; d is D floored at _EPS and
    divided by _D_REF.
    """
    uniq = np.maximum(U, _EPS) / _U_REF
    toks = np.maximum(D, _EPS) / _D_REF
    ratio = toks / (uniq + _EPS)
    saturation = 1.0 - np.exp(-ratio)
    return uniq * saturation + _EPS

def scaling_law_func(data_points, params):
    """Predict loss from [unique_tokens, params, tokens] rows.

    With p the model size divided by _P_REF, te the value returned by
    _effective_tokens and r = max(p / te, _EPS):
        loss = L + (Ap * p^(-alp) + Ad * te^(-ald)) / (1 + max(K, 0) * r^eta)

    params holds [L, Ap, alp, Ad, ald, K, eta], as one vector or one row per
    parameter set; columns beyond the seventh are ignored.
    Returns shape (N,) for a single parameter set, otherwise (N, T).
    """
    # data_points: (N,3) -> [unique_tokens, params, tokens]
    points = np.atleast_2d(np.asarray(data_points))
    uniq, size, tokens = points[:, 0], points[:, 1], points[:, 2]
    p = np.maximum(size, _EPS) / _P_REF
    te = _effective_tokens(uniq, tokens)
    r = np.maximum(p / te, _EPS)

    theta = np.atleast_2d(np.asarray(params))[:, :7]
    floor, amp_p, exp_p, amp_d, exp_d, k_int, eta = (
        theta[:, j][None, :] for j in range(7)
    )

    log_p = np.log(p[:, None])
    log_te = np.log(te[:, None])
    log_r = np.log(r[:, None])

    size_term = amp_p * np.exp(-exp_p * log_p)      # Ap * p^(-alp)
    data_term = amp_d * np.exp(-exp_d * log_te)     # Ad * te^(-ald)
    damping = 1.0 + np.maximum(k_int, 0.0) * np.exp(eta * log_r)  # 1 + K * r^eta
    pred = floor + (size_term + data_term) / damping

    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """Fit [L, Ap, alp, Ad, ald, K, eta] per target with bounded L-BFGS-B.

    For each target column a set of seeds is built (a grid of exponents and
    interaction settings with amplitudes from a closed-form ridge solve, plus
    random seeds). Every seed is first fitted with the interaction switched
    off (K = 0, eta = 1), then with all seven parameters. The best candidate
    receives a final refinement under a tighter Huber threshold.

    data_points: array-like with columns [unique_tokens, params, tokens].
    loss_values: 1-D array of losses, or 2-D with one column per target.
    Returns a length-7 vector for a single target, otherwise shape (T, 7).
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    y2d = y[:, None] if y.ndim == 1 else y
    T = y2d.shape[1]

    # Same derived quantities as scaling_law_func, precomputed for infer_scales.
    U, Pm, D = X[:, 0], X[:, 1], X[:, 2]
    p   = np.maximum(Pm, _EPS) / _P_REF
    te  = _effective_tokens(U, D)
    r   = np.maximum(p / te, _EPS)
    ln_p = np.log(p + _EPS)
    ln_t = np.log(te + _EPS)
    ln_r = np.log(r + _EPS)

    # Box constraints in parameter order.
    bounds = [
        (0.2, 10.0),    # L_inf
        (1e-8, 200.0),  # Ap
        (0.05, 1.5),    # alpha_p
        (1e-8, 200.0),  # Ad
        (0.05, 1.5),    # alpha_d
        (0.0, 10.0),    # K
        (0.0, 2.5),     # eta
    ]

    # Mean Huber loss: quadratic within delta, linear beyond it.
    def huber_mean(rm, delta=0.45):
        a = np.abs(rm)
        return np.mean(np.where(a <= delta, 0.5 * rm**2, delta * (a - 0.5 * delta)))

    lam = 1e-6
    # Full 7-parameter objective: Huber data loss, L2 on amplitudes and
    # exponents, and weak pulls of K towards 0.5 and eta towards 1.0.
    def objective(theta, target):
        pred = scaling_law_func(X, theta)
        res  = pred - target
        reg  = lam * (theta[1]**2 + theta[3]**2 + theta[2]**2 + theta[4]**2) \
             + 5e-5 * (theta[5] - 0.5)**2 + 5e-5 * (theta[6] - 1.0)**2
        return huber_mean(res) + reg

    # Closed-form ridge LS for Ap, Ad given {L, alp, ald, K, eta}
    def infer_scales(L0, a0, b0, K0, e0, target):
        den  = 1.0 + np.maximum(K0, 0.0) * np.exp(e0 * ln_r)
        gp   = np.exp(-a0 * ln_p) / den
        gd   = np.exp(-b0 * ln_t) / den
        G    = np.stack([gp, gd], axis=1)
        bvec = target - L0
        reg  = 1e-8
        GTG  = G.T @ G + reg * np.eye(2)
        GTb  = G.T @ bvec
        sol  = np.linalg.solve(GTG, GTb)
        # Clip the solved amplitudes into their optimizer bounds.
        Ap0  = float(np.clip(sol[0], bounds[1][0], bounds[1][1]))
        Ad0  = float(np.clip(sol[1], bounds[3][0], bounds[3][1]))
        return Ap0, Ad0

    y_min  = np.min(y2d, axis=0)
    y_mean = np.mean(y2d, axis=0)
    # Fixed seed keeps the random seeds reproducible.
    rng    = np.random.default_rng(123)

    best = np.zeros((T, 7), dtype=np.float64)
    for t in range(T):
        target = y2d[:, t]
        # Two candidate loss floors: just below the minimum, and mean - 0.5.
        L0a = float(np.clip(y_min[t] * 0.95, bounds[0][0], bounds[0][1]))
        L0b = float(np.clip(y_mean[t] - 0.5, bounds[0][0], bounds[0][1]))

        # (alpha_p, alpha_d) pairs and (K, eta) pairs; combined with the two
        # floors this gives 32 grid seeds.
        exp_grid  = [(0.30, 0.35), (0.25, 0.30), (0.40, 0.20), (0.20, 0.45)]
        inter_grid= [(0.0, 0.8), (0.2, 0.6), (0.8, 1.0), (1.5, 1.2)]
        seeds = []
        for L0 in (L0a, L0b):
            for (ap0, ad0) in exp_grid:
                for (K0, e0) in inter_grid:
                    Ap0, Ad0 = infer_scales(L0, ap0, ad0, K0, e0, target)
                    seeds.append([L0, Ap0, ap0, Ad0, ad0, K0, e0])

        # Add randomized seeds for exploration
        for _ in range(4):
            seeds.append([rng.uniform(lo, hi) for (lo, hi) in bounds])

        # Stage-1 (no interaction) then full refinement
        # obj5 evaluates the model with K = 0 and eta = 1, so the denominator is 1.
        def obj5(th5, target):
            th7 = np.array([th5[0], th5[1], th5[2], th5[3], th5[4], 0.0, 1.0], dtype=np.float64)
            pred = scaling_law_func(X, th7)
            res  = pred - target
            reg  = lam * (th5[1]**2 + th5[3]**2 + th5[2]**2 + th5[4]**2)
            return huber_mean(res, 0.50) + reg

        b5 = [bounds[i] for i in range(5)]
        val_best = np.inf
        th_best  = np.array(seeds[0], dtype=np.float64)
        for s in seeds:
            # First compress to 5D and fit without interaction
            th5_init = np.array(s[:5], dtype=np.float64)
            th5_init = np.array([np.clip(th5_init[i], *b5[i]) for i in range(5)], dtype=np.float64)
            res5 = minimize(lambda th: obj5(th, target), th5_init, method='L-BFGS-B', bounds=b5)
            # NOTE(review): res5.x is discarded when success is False.
            th5 = res5.x if res5.success else th5_init
            # Expand to 7D using the seed's interaction, then optimize full model
            th7_init = np.array([th5[0], th5[1], th5[2], th5[3], th5[4], s[5], s[6]], dtype=np.float64)
            th7_init = np.array([np.clip(th7_init[i], *bounds[i]) for i in range(7)], dtype=np.float64)
            res7 = minimize(lambda th: objective(th, target), th7_init, method='L-BFGS-B', bounds=bounds)
            cand = res7.x if res7.success else th7_init
            # Candidates are ranked by the full objective (default Huber delta).
            v = objective(cand, target)
            if v < val_best:
                val_best = v
                th_best = cand

        # Final local refinement with tighter Huber
        def objective_tight(theta, target):
            pred = scaling_law_func(X, theta)
            res  = pred - target
            reg  = lam * (theta[1]**2 + theta[3]**2 + theta[2]**2 + theta[4]**2) \
                 + 5e-5 * (theta[5] - 0.5)**2 + 5e-5 * (theta[6] - 1.0)**2
            # Slightly tighter delta improves MAE without destabilizing NMSE
            return huber_mean(res, 0.35) + reg

        res_final = minimize(lambda th: objective_tight(th, target), th_best, method='L-BFGS-B', bounds=bounds)
        best[t] = res_final.x if res_final.success else th_best

    return best[0] if T == 1 else best
# EVOLVE-BLOCK-END

#5 Run 1 R² = 0.900311

▼

Python

# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize

# Reference scales: the model uses the ratios _P_REF / P and _D_REF / D_eff.
_P_REF = 1e9
_D_REF = 1e11
# Offset added to the effective token count before it is clipped to >= 1.
# NOTE: despite the name this is 1.0, not a tiny epsilon.
_EPS   = 1.0

def _sp(z):
    """Softplus log(1 + exp(z)), written so that exp never sees a positive argument."""
    positive_part = np.maximum(z, 0.0)
    tail = np.log1p(np.exp(-np.abs(z)))
    return tail + positive_part

def _sg(z):
    """Logistic sigmoid 1 / (1 + exp(-z))."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def scaling_law_func(data_points, params):
    """Predict loss from rows of [U, P, D] (X columns 0, 1, 2).

    For each parameter row [L_inf, A, alpha, B, beta, U0] (+ optional C):
        D_eff = clip(D * (1 - exp(-U / U0)) + _EPS, 1, inf)
        fP    = (_P_REF / max(P, 1)) ** alpha
        fD    = (_D_REF / D_eff) ** beta
        loss  = L_inf + A * fP + B * fD + C * fP * fD
    U0 is raised to at least 1 and C defaults to 0 when only 6 values are
    supplied.

    Returns shape (N,) for a single parameter row, otherwise (N, T).
    Raises ValueError when fewer than 6 parameters per row are given.
    """
    points = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    uniq, size, tokens = points[:, 0], points[:, 1], points[:, 2]

    theta = np.atleast_2d(np.asarray(params, dtype=np.float64))
    n_params = theta.shape[1]
    if n_params < 6:
        raise ValueError("params must have at least 6 elements: [L_inf, A, alpha, B, beta, U0] (+optional C)")
    has_synergy = n_params >= 7

    preds = np.empty((points.shape[0], theta.shape[0]), dtype=np.float64)
    # The size ratio does not depend on the parameter row.
    size_ratio = _P_REF / np.clip(size, 1.0, None)

    for col, row in enumerate(theta):
        floor, amp_p, exp_p, amp_d, exp_d, u_scale = row[:6]
        synergy = row[6] if has_synergy else 0.0
        u_scale = max(float(u_scale), 1.0)
        # Diversity-adjusted effective data: D_eff = D * (1 - exp(-U/U0))
        decay = np.exp(-np.clip(uniq / u_scale, 0.0, 50.0))
        d_eff = np.clip(tokens * (1.0 - decay) + _EPS, 1.0, None)

        f_size = size_ratio ** exp_p
        f_data = (_D_REF / d_eff) ** exp_d
        f_cross = f_size * f_data
        preds[:, col] = floor + amp_p * f_size + amp_d * f_data + synergy * f_cross

    if preds.shape[1] == 1:
        return preds[:, 0]
    return preds

def fit_scaling_law(data_points, loss_values):
    """Fit [L_inf, A, alpha, B, beta, U0, C] per target with multi-start L-BFGS-B.

    Parameters are optimized through unconstrained variables z (sigmoid for
    the bounded L_inf, alpha and beta; softplus for A, B, U0 and C). Starting
    points come from a 3 x 3 x 3 grid over (alpha, beta, U0) with the linear
    coefficients solved by ridge least squares; the 6 best seeds and a
    jittered copy of each are then optimized under a log-cosh loss.

    data_points: array-like with columns [U, P, D] as used by scaling_law_func.
    loss_values: 1-D array of losses, or 2-D with one column per target.
    Returns a length-7 vector for a single target, otherwise shape (T, 7).
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    Y = y[:, None] if y.ndim == 1 else y
    T = Y.shape[1]
    U, P, D = X[:, 0], X[:, 1], X[:, 2]
    Pc = np.clip(P, 1.0, None)

    # Range of L_inf and upper limits of the exponents alpha and beta.
    L_MIN, L_MAX = 0.5, 10.0
    AMAX, BMAX = 2.5, 2.5

    # Map unconstrained z (one row per parameter set) to physical parameters.
    def pack(z):
        z = np.atleast_2d(z)
        L  = L_MIN + (L_MAX - L_MIN) * _sg(z[:, 0])
        A  = _sp(z[:, 1]) + 1e-8
        a  = AMAX * _sg(z[:, 2])
        B  = _sp(z[:, 3]) + 1e-8
        b  = BMAX * _sg(z[:, 4])
        U0 = _sp(z[:, 5]) + 1.0
        C  = _sp(z[:, 6])  # synergy term, nonnegative
        return np.stack([L, A, a, B, b, U0, C], axis=1)

    # Inverse softplus; for x > 20 the input itself is returned.
    def sp_inv(x):
        x = np.maximum(np.asarray(x, dtype=np.float64), 1e-12)
        return np.where(x > 20.0, x, np.log(np.expm1(x)))

    # Inverse of lo + (hi - lo) * sigmoid(z), with the fraction clipped away
    # from 0 and 1.
    def inv_sig(x, lo, hi):
        p = np.clip((x - lo) / (hi - lo), 1e-8, 1 - 1e-8)
        return np.log(p / (1.0 - p))

    # Objective for one target column: mean log-cosh of the residuals plus L2.
    def obj(z, ycol):
        th = pack(z.reshape(1, -1))
        pred = scaling_law_func(X, th)
        e = pred - ycol
        # NOTE(review): np.cosh overflows to inf for very large |e|; presumably
        # residuals never get that large here — confirm.
        loss = np.mean(np.log(np.cosh(e)))
        # Mild L2 regularization plus slightly stronger on synergy C
        reg = 1e-6 * np.sum(th**2) + 5e-6 * np.sum(th[:, 6]**2)
        return loss + reg

    params_out = np.zeros((T, 7), dtype=np.float64)
    # Fixed seed keeps the jittered starts reproducible.
    rng = np.random.default_rng(42)

    # Seed grids for exponents and diversity scale
    # U0 candidates are the 30th, 50th and 70th percentiles of U, at least 1.
    u_cands = [float(np.percentile(U, q)) for q in (30, 50, 70)]
    u_cands = [max(1.0, u) for u in u_cands]
    a_cands = [0.20, 0.35, 0.50]
    b_cands = [0.20, 0.35, 0.50]

    for t in range(T):
        y_t = Y[:, t]
        seeds = []

        for a0 in a_cands:
            fP = (_P_REF / Pc) ** a0
            for b0 in b_cands:
                for u0 in u_cands:
                    # Same feature construction as scaling_law_func.
                    s = np.exp(-np.clip(U / max(u0, 1.0), 0.0, 50.0))
                    De = np.clip(D * (1.0 - s) + _EPS, 1.0, None)
                    fD  = (_D_REF / De) ** b0
                    fPD = fP * fD
                    # Regularized least squares for [L, A, B, C]
                    A_mat = np.column_stack([np.ones_like(fP), fP, fD, fPD])
                    lam_r = 1e-8
                    G = A_mat.T @ A_mat + lam_r * np.eye(4)
                    rhs = A_mat.T @ y_t
                    coef = np.linalg.solve(G, rhs)
                    L0, A0, B0, C0 = coef
                    # Project the solved coefficients into the ranges pack can produce.
                    L0 = float(np.clip(L0, L_MIN, L_MAX))
                    A0 = float(max(A0, 1e-8))
                    B0 = float(max(B0, 1e-8))
                    C0 = float(max(C0, 0.0))
                    # Store the seed in unconstrained z coordinates.
                    seeds.append(np.array([
                        inv_sig(L0, L_MIN, L_MAX),
                        sp_inv(A0),
                        inv_sig(a0, 0.0, AMAX),
                        sp_inv(B0),
                        inv_sig(b0, 0.0, BMAX),
                        sp_inv(u0 - 1.0),
                        sp_inv(C0)
                    ], dtype=np.float64))

        # Keep the 6 seeds with the lowest objective; each contributes itself
        # and one Gaussian-jittered copy (12 starts in total).
        vals = np.array([obj(s, y_t) for s in seeds])
        idxs = np.argsort(vals)[:6]
        starts = []
        for i in idxs:
            s = seeds[i]
            starts.append(s)
            starts.append(s + rng.normal(0.0, 0.2, size=s.shape))

        best_v, best_th = np.inf, None
        for z0 in starts:
            res = minimize(obj, z0, args=(y_t,), method='L-BFGS-B',
                           options={'maxiter': 800, 'ftol': 1e-9})
            # NOTE(review): res.x is discarded when success is False, even if
            # it improves on the starting point.
            cand = res.x if res.success else z0
            v = obj(cand, y_t)
            if v < best_v:
                best_v = v
                best_th = pack(cand.reshape(1, -1))[0]

        # Fall back to the first start if no candidate had an objective below +inf.
        params_out[t, :] = best_th if best_th is not None else pack(starts[0].reshape(1, -1))[0]

    return params_out[0] if T == 1 else params_out
# EVOLVE-BLOCK-END