Domain Mixture Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.998807
Mean R²: 0.996258
Min R²: 0.991590
Runs: 5
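
For reference, the R² values above are presumably the standard coefficient of determination between predicted and observed per-domain losses, pooled across domains. A minimal sketch of that metric (the evaluation harness itself is not shown, so pooling by flattening is an assumption):

Python
import numpy as np

def r2_score(y_true, y_pred):
    # coefficient of determination, pooled over all (sample, domain) entries
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
    return 1.0 - ss_res / ss_tot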

All Runs (sorted by R²)

#1 Run 5 R² = 0.998807 (Best)
Python
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predict per-domain losses from mixture proportions via a domain-specific
    power law plus linear cross-domain coupling. Total params = 35:
      - params[0:20]:  off-diagonal coupling weights (5x5 matrix minus its diagonal)
      - params[20:25]: own-domain weights
      - params[25:30]: own-domain exponents
      - params[30:35]: per-domain biases
    preds[n,j] = w[j]*(X[n,j]**e[j]) + sum_{i!=j} W[j,i]*X[n,i] + b[j]
    """
    X = np.asarray(data_points, dtype=float)
    N, F = X.shape
    assert F == 5, "Expected 5 mixture proportions"
    p = np.asarray(params, dtype=float).ravel()

    # unpack off-diagonal weights into a 5x5 matrix with zeros on the diagonal
    mask = np.eye(5, dtype=bool)
    W_off = np.zeros((5, 5), dtype=float)
    W_off[~mask] = p[:20]

    # own-domain weights, exponents, and biases
    w_own = p[20:25]    # length-5
    e      = p[25:30]   # length-5
    b      = p[30:35]   # length-5

    # compute own-domain power-law contributions; X holds proportions in [0, 1]
    # and the exponents stay positive during fitting, so the power is well-defined
    X_pow = np.power(X, e)        # shape (N, 5)
    own   = X_pow * w_own         # broadcast multiply across columns

    # compute cross-domain linear contributions
    cross = X.dot(W_off.T)        # shape (N,5)

    # final prediction
    return own + cross + b

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 35 parameters to minimize MSE between predictions and true losses.
    Uses multi-start L-BFGS-B with exponent bounds [0.1, 5.0].
    """
    X = np.asarray(data_points, dtype=float)
    y = np.asarray(loss_values, dtype=float)
    if y.ndim == 1:
        # accept a flattened (N*5,) loss vector; assume row-major ordering
        y = y.reshape(-1, 5)
    N, F = X.shape
    assert F == 5 and y.shape == (N, 5), "Expected X and y of shape (N, 5)"

    P = 35
    # default initialization
    p0 = np.zeros(P, dtype=float)
    # own-domain weights initialized negative (higher mix lowers loss)
    p0[20:25] = -1.0
    # exponents initialized to linear
    p0[25:30] = 1.0
    # biases initialized to mean per-domain loss
    p0[30:35] = y.mean(axis=0)

    # bounds: exponents in [0.1,5], others unbounded
    bounds = [(None, None)] * 20 + [(None, None)] * 5 + [(0.1, 5.0)] * 5 + [(None, None)] * 5

    # objective: MSE
    def objective(p):
        pred = scaling_law_func(X, p)
        return np.mean((pred - y) ** 2)

    # multi-start optimization for robustness
    best_p, best_val = p0.copy(), np.inf
    for seed in (0, 1, 2):
        if seed == 0:
            init = p0
        else:
            rng = np.random.RandomState(seed)
            init = p0 + rng.randn(P) * 0.1
        res = minimize(objective, init,
                       method='L-BFGS-B',
                       bounds=bounds,
                       options={'maxiter': 1000, 'ftol': 1e-9})
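        # keep the best solution across starts; if none succeeds, p0 is returned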
        if res.success and res.fun < best_val:
            best_val, best_p = res.fun, res.x

    return best_p
#2 Run 1 R² = 0.998073
#3 Run 4 R² = 0.997817
#4 Run 3 R² = 0.995001
#5 Run 2 R² = 0.991590
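
A usage sketch, assuming scaling_law_func and fit_scaling_law from the best run above are in scope. The Dirichlet mixtures and synthetic losses below are purely illustrative stand-ins, not the benchmark data:

Python
import numpy as np

# 100 hypothetical 5-domain mixture vectors (rows sum to 1)
rng = np.random.RandomState(42)
X = rng.dirichlet(np.ones(5), size=100)

# synthetic losses loosely matching the model family (illustrative only)
Y = 3.0 - 1.5 * np.power(X, 0.5) + 0.05 * rng.randn(100, 5)

params = fit_scaling_law(X, Y)
preds = scaling_law_func(X, params)
ss_res = np.sum((preds - Y) ** 2)
ss_tot = np.sum((Y - Y.mean()) ** 2)
print("MSE:", np.mean((preds - Y) ** 2))
print("R^2:", 1.0 - ss_res / ss_tot)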