Domain Mixture Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.998807
Mean R²: 0.996258
Min R²: 0.991590
Runs: 5
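
For reference, the R² values above are presumably the standard coefficient of determination between predicted and observed per-domain losses, pooled across domains. A minimal sketch of that metric (the evaluation harness itself is not shown, so pooling by flattening is an assumption):

Python
import numpy as np

def r2_score(y_true, y_pred):
    # coefficient of determination, pooled over all (sample, domain) entries
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
    return 1.0 - ss_res / ss_tot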

All Runs (sorted by R²)

#1 Run 5 R² = 0.998807 (Best)
Python
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predict per-domain losses from mixture proportions via a domain-specific
    power law plus linear cross-domain coupling. Total params = 35:
      - params[0:20]:  off-diagonal coupling weights (5x5 matrix minus its diagonal)
      - params[20:25]: own-domain weights
      - params[25:30]: own-domain exponents
      - params[30:35]: per-domain biases
    preds[n,j] = w[j]*(X[n,j]**e[j]) + sum_{i!=j} W[j,i]*X[n,i] + b[j]
    """
    X = np.asarray(data_points, dtype=float)
    N, F = X.shape
    assert F == 5, "Expected 5 mixture proportions"
    p = np.asarray(params, dtype=float).ravel()

    # unpack off-diagonal weights into a 5x5 matrix with zeros on the diagonal
    mask = np.eye(5, dtype=bool)
    W_off = np.zeros((5, 5), dtype=float)
    W_off[~mask] = p[:20]

    # own-domain weights, exponents, and biases
    w_own = p[20:25]    # length-5
    e      = p[25:30]   # length-5
    b      = p[30:35]   # length-5

    # compute own-domain power-law contributions; X holds proportions in [0, 1]
    # and the exponents stay positive during fitting, so the power is well-defined
    X_pow = np.power(X, e)        # shape (N, 5)
    own   = X_pow * w_own         # broadcast multiply across columns

    # compute cross-domain linear contributions
    cross = X.dot(W_off.T)        # shape (N,5)

    # final prediction
    return own + cross + b

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 35 parameters to minimize MSE between predictions and true losses.
    Uses multi-start L-BFGS-B with exponent bounds [0.1, 5.0].
    """
    X = np.asarray(data_points, dtype=float)
    y = np.asarray(loss_values, dtype=float)
    if y.ndim == 1:
        # accept a flattened (N*5,) loss vector; assume row-major ordering
        y = y.reshape(-1, 5)
    N, F = X.shape
    assert F == 5 and y.shape == (N, 5), "Expected X and y of shape (N, 5)"

    P = 35
    # default initialization
    p0 = np.zeros(P, dtype=float)
    # own-domain weights initialized negative (higher mix lowers loss)
    p0[20:25] = -1.0
    # exponents initialized to linear
    p0[25:30] = 1.0
    # biases initialized to mean per-domain loss
    p0[30:35] = y.mean(axis=0)

    # bounds: exponents in [0.1,5], others unbounded
    bounds = [(None, None)] * 20 + [(None, None)] * 5 + [(0.1, 5.0)] * 5 + [(None, None)] * 5

    # objective: MSE
    def objective(p):
        pred = scaling_law_func(X, p)
        return np.mean((pred - y) ** 2)

    # multi-start optimization for robustness
    best_p, best_val = p0.copy(), np.inf
    for seed in (0, 1, 2):
        if seed == 0:
            init = p0
        else:
            rng = np.random.RandomState(seed)
            init = p0 + rng.randn(P) * 0.1
        res = minimize(objective, init,
                       method='L-BFGS-B',
                       bounds=bounds,
                       options={'maxiter': 1000, 'ftol': 1e-9})
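        # keep the best solution across starts; if none succeeds, p0 is returned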
        if res.success and res.fun < best_val:
            best_val, best_p = res.fun, res.x

    return best_p
#2 Run 1 R² = 0.998073
#3 Run 4 R² = 0.997817
#4 Run 3 R² = 0.995001
#5 Run 2 R² = 0.991590
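
A usage sketch, assuming scaling_law_func and fit_scaling_law from the best run above are in scope. The Dirichlet mixtures and synthetic losses below are purely illustrative stand-ins, not the benchmark data:

Python
import numpy as np

# 100 hypothetical 5-domain mixture vectors (rows sum to 1)
rng = np.random.RandomState(42)
X = rng.dirichlet(np.ones(5), size=100)

# synthetic losses loosely matching the model family (illustrative only)
Y = 3.0 - 1.5 * np.power(X, 0.5) + 0.05 * rng.randn(100, 5)

params = fit_scaling_law(X, Y)
preds = scaling_law_func(X, params)
ss_res = np.sum((preds - Y) ** 2)
ss_tot = np.sum((Y - Y.mean()) ** 2)
print("MSE:", np.mean((preds - Y) ** 2))
print("R^2:", 1.0 - ss_res / ss_tot)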