
Domain Mixture Scaling Law

Agent: SLDAgent
Model: GPT-5
Best R²: 0.998312
Mean R²: 0.996213
Min R²: 0.990296
Runs: 5
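
For reference, the functional form recovered from the best run's parameter layout (a sketch read off the code below, not an official specification) is a per-output power law over the F mixture fractions:

\hat{L}_t(x) = b_t + \sum_{f=1}^{F} W_{t,f}\, \tilde{x}_f^{\,a_t}, \qquad \tilde{x}_f = \mathrm{clip}(x_f,\, 10^{-12},\, 1), \quad t = 1, \dots, T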

All Runs (sorted by R²)

#1 Run 3 (Best) R² = 0.998312
Python
# EVOLVE-BLOCK-START
import numpy as np

def scaling_law_func(data_points, params):
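    # Evaluate the law. data_points: (N, F) feature matrix (e.g. domain
    # mixture fractions); params: flat vector whose length selects one of
    # the layouts below; returns an (N, T) matrix, one column per output.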
    X = np.atleast_2d(np.asarray(data_points, float))
    N, F = X.shape
    p = np.asarray(params, float).ravel()
    L = p.size
    Xc = np.clip(X, 1e-12, 1.0)
    # Preferred: per-output alpha, layout L = T*(F+2) => [b(T), W(T*F), a(T)]
    if L > 0 and L % (F + 2) == 0:
        T = L // (F + 2)
        b = p[:T]
        W = p[T:T + T * F].reshape(T, F)
        a = p[-T:]
        Z = Xc[:, None, :] ** a[None, :, None]
        return (Z * W[None]).sum(2) + b
    # Shared alpha: L = (F+1)*T + 1
    if L > 1 and (L - 1) % (F + 1) == 0:
        T = (L - 1) // (F + 1)
        b = p[:T]
        W = p[T:T + T * F].reshape(T, F)
        a = float(p[-1])
        return (Xc ** a) @ W.T + b
    # Linear fallback: L = (F+1)*T
    if L > 0 and L % (F + 1) == 0:
        T = L // (F + 1)
        B = p.reshape(T, F + 1)
        ZA = np.concatenate([np.ones((N, 1)), X], 1)
        return ZA @ B.T
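    # Last resort: broadcast the first few params as constant predictions.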
    T = min(5, L if L > 0 else 1)
    return np.tile(p[:T][None, :], (N, 1))


def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points, float))
    Y = np.asarray(loss_values, float)
    if Y.ndim == 1:
        Y = Y[:, None]
    N, F = X.shape
    T = Y.shape[1]
    Xc = np.clip(X, 1e-12, 1.0)
    lam_b = 1e-8
    lam_grid = np.array([1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4], float)
    a_min, a_max = 0.2, 3.0
    phi = (1 + 5 ** 0.5) / 2
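    # For a fixed exponent a the model is linear in (b, W), so ridge
    # regression solves the inner problem in closed form; a one-dimensional
    # golden-section search (driven by phi above) optimizes a on the outside.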

    def nmse_cols(y, yhat):
        v = np.var(y, axis=0, ddof=0) + 1e-12
        return np.mean(((yhat - y) ** 2) / v)

    def solve_scaled(Z, Ymat, lam):
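        # Rescale each column of Z to unit RMS so a single ridge penalty
        # treats all features comparably; the intercept gets only the tiny
        # penalty lam_b. Coefficients are mapped back to the original scale.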
        s = np.sqrt((Z ** 2).mean(0)) + 1e-12
        Zs = Z / s
        ZA = np.concatenate([np.ones((N, 1)), Zs], 1)
        A = ZA.T @ ZA
        A[0, 0] += lam_b
        A[1:, 1:] += lam * np.eye(F)
        try:
            B = np.linalg.solve(A, ZA.T @ Ymat)
        except np.linalg.LinAlgError:
            B = np.linalg.lstsq(A + 1e-12 * np.eye(F + 1), ZA.T @ Ymat, rcond=None)[0]
        b = B[0, :]
        W = (B[1:, :].T) / s
        return W, b

    def best_ridge(Z, Ymat):
        best, Wb = np.inf, None
        for lam in lam_grid:
            W, b = solve_scaled(Z, Ymat, lam)
            val = nmse_cols(Ymat, Z @ W.T + b)
            if val < best:
                best, Wb = val, (W, b)
        return best, Wb

    def shared_obj(a):
        Z = Xc ** float(a)
        v, _ = best_ridge(Z, Y)
        return v

    # Shared alpha (multi-output) via coarse grid + golden section
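    # The coarse grid seeds a bracket of width at most 1.4 around the best
    # grid point; 22 golden-section iterations shrink it below 1e-4.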
    grid = np.linspace(0.3, 2.5, 9)
    a_shared = grid[np.argmin([shared_obj(a) for a in grid])]
    Lb, Ub = max(a_min, a_shared - 0.7), min(a_max, a_shared + 0.7)
    c = Ub - (Ub - Lb) / phi
    d = Lb + (Ub - Lb) / phi
    vc, vd = shared_obj(c), shared_obj(d)
    for _ in range(22):
        if vc < vd:
            Ub, d, vd = d, c, vc
            c = Ub - (Ub - Lb) / phi
            vc = shared_obj(c)
        else:
            Lb, c, vc = c, d, vd
            d = Lb + (Ub - Lb) / phi
            vd = shared_obj(d)
    a_shared = c if vc < vd else d

    # Per-output refinement with coupling penalty to a_shared and lam selection
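    # The penalty rho * (a - a_shared)^2 softly ties each per-output
    # exponent to the shared one, so a single noisy output cannot drift far.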
    rho = 1e-3
    bs, Ws, alphas = [], [], []

    def fit_for_alpha(y, a):
        Z = Xc ** float(a)
        best, Wb = np.inf, None
        for lam in lam_grid:
            W, b = solve_scaled(Z, y, lam)
            pred = Z @ W.T + b
            v = float(np.var(y, ddof=0) + 1e-12)
            nm = float(np.mean((pred - y) ** 2) / v)
            if nm < best:
                best, Wb = nm, (b[0], W[0])
        return best, Wb

    for t in range(T):
        y = Y[:, t:t+1]
        # coarse search
        vals = [fit_for_alpha(y, a) for a in grid]
        idx = np.argmin([nm + rho * (a - a_shared) ** 2 for (nm, _), a in zip(vals, grid)])
        a0 = grid[idx]
        # refine by golden section
        Lb, Ub = max(a_min, a0 - 0.7), min(a_max, a0 + 0.7)
        c = Ub - (Ub - Lb) / phi
        d = Lb + (Ub - Lb) / phi
        nc, Bc = fit_for_alpha(y, c)
        vc = nc + rho * (c - a_shared) ** 2
        nd, Bd = fit_for_alpha(y, d)
        vd = nd + rho * (d - a_shared) ** 2
        for _ in range(22):
            if vc < vd:
                Ub, d, vd, Bd = d, c, vc, Bc
                c = Ub - (Ub - Lb) / phi
                nc, Bc = fit_for_alpha(y, c)
                vc = nc + rho * (c - a_shared) ** 2
            else:
                Lb, c, vc, Bc = c, d, vd, Bd
                d = Lb + (Ub - Lb) / phi
                nd, Bd = fit_for_alpha(y, d)
                vd = nd + rho * (d - a_shared) ** 2
        a_opt, B_opt = (c, Bc) if vc < vd else (d, Bd)
        alphas.append(a_opt)
        bs.append(B_opt[0])
        Ws.append(B_opt[1])

    b = np.asarray(bs)
    W = np.vstack(Ws)
    a = np.asarray(alphas)
    return np.concatenate([b, W.ravel(), a])
# EVOLVE-BLOCK-END
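
A minimal usage sketch (illustrative only; the shapes below, 64 samples with
5 mixture features and 3 output domains, are assumed and not from the run):

Python
import numpy as np

rng = np.random.default_rng(0)
X = rng.dirichlet(np.ones(5), size=64)   # (N=64, F=5) mixture fractions
true_W = rng.uniform(0.5, 2.0, size=(3, 5))
Y = 1.0 + (X ** 0.8) @ true_W.T          # (N, T=3) synthetic losses

params = fit_scaling_law(X, Y)           # flat vector, length T*(F+2) = 21
pred = scaling_law_func(X, params)       # (64, 3) predicted losses
print(pred.shape, float(np.mean((pred - Y) ** 2)))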
#2 Run 2 R² = 0.998092
#3 Run 1 R² = 0.997210
#4 Run 4 R² = 0.997157
#5 Run 5 R² = 0.990296