← Back to Leaderboard

Data-Constrained Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.929252
Mean R²: 0.897613
Min R²: 0.813374
Runs: 5

All Runs (sorted by R²)

Best Run 3 R² = 0.929252
Python
# EVOLVE-BLOCK-START
"""
Stable log-parameterization additive power-law scaling for LLM loss under
unique_tokens (U), parameters (P), and tokens (T) constraints:

    L(U,P,T) = c0
             + exp(lk1 - a1·ln U)
             + exp(lk2 - a2·ln P)
             + exp(lk3 - a3·ln T)

7 parameters:
  c0, lk1, a1, lk2, a2, lk3, a3

Positivity of k-terms is enforced via exp(log-k).  Exponents a_i ∈ [0,5].
Fitted via L-BFGS-B with bounds for numerical stability.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Evaluate the additive power-law loss model on (U, P, T) inputs.

    The model is
        loss = c0 + exp(lk1 - a1*ln U) + exp(lk2 - a2*ln P) + exp(lk3 - a3*ln T)

    Args:
      data_points: array-like of shape (N,3) holding
        [unique_tokens, params, tokens] per row; a single 1-D point is
        treated as one row.
      params: sequence of exactly 7 floats [c0, lk1, a1, lk2, a2, lk3, a3].

    Returns:
      ndarray of shape (N,) with the predicted loss for each row.
    """
    points = np.asarray(data_points, dtype=float)
    if points.ndim == 1:
        # promote a lone point to a one-row matrix
        points = points[np.newaxis, :]

    # Floor every feature at 1e-8 before taking logs so log(0) never occurs.
    log_u, log_p, log_t = (
        np.log(np.clip(points[:, col], 1e-8, None)) for col in range(3)
    )

    offset, log_k_u, exp_u, log_k_p, exp_p, log_k_t, exp_t = params

    # Each power law k * x**(-a) is evaluated as exp(log k - a * log x),
    # which keeps every term strictly positive.
    total = offset + np.exp(log_k_u - exp_u * log_u)
    total = total + np.exp(log_k_p - exp_p * log_p)
    total = total + np.exp(log_k_t - exp_t * log_t)
    return total

def fit_scaling_law(data_points, loss_values):
    """
    Fit the 7-parameter scaling law to (U,P,T) → loss data.

    The mean squared error between `scaling_law_func` predictions and the
    observed losses is minimised with bounded L-BFGS-B from a single
    data-driven starting point.

    Args:
      data_points: array-like of shape (N,3) with
        [unique_tokens, params, tokens]; a single 1-D point is treated as
        one row.
      loss_values: array-like with N observed losses (flattened internally).

    Returns:
      params_opt: ndarray of 7 fitted parameters
        [c0, lk1, a1, lk2, a2, lk3, a3].  This is the optimiser's final
        iterate whenever it is finite and fits at least as well as the
        starting point; otherwise the (feasible) starting point is returned.

    Raises:
      ValueError: if there are no observations, if `data_points` has fewer
        than three columns, or if the number of rows differs from the number
        of loss values.
    """
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X[None, :]
    y = np.asarray(loss_values, dtype=float).ravel()

    if X.ndim != 2 or X.shape[1] < 3:
        raise ValueError(
            "data_points must have shape (N, 3): [unique_tokens, params, tokens]"
        )
    if y.size == 0:
        raise ValueError("cannot fit a scaling law to zero observations")
    if X.shape[0] != y.size:
        raise ValueError(
            f"data_points has {X.shape[0]} rows but loss_values has {y.size} entries"
        )

    # 1) Start c0 at 90% of the 5th percentile of y, kept inside its bounds.
    #    The three additive terms are strictly positive, so c0 can never
    #    exceed the smallest observed loss.
    y_min = float(np.min(y))
    if y_min >= 0.0:
        c0_bounds = (0.0, y_min)
        c0_init = min(max(0.0, float(np.percentile(y, 5)) * 0.9), y_min)
    else:
        # A [0, y_min] box would be empty for negative losses; leave c0
        # unbounded below and start slightly under the smallest loss.
        c0_bounds = (None, y_min)
        c0_init = 1.1 * y_min

    # 2) Shifted target for k-terms
    y_shift = np.clip(y - c0_init, 1e-12, None)

    # 3) Compute inverse log-spread weights for U, P, T
    log_feats = np.log(np.clip(X[:, :3], 1e-12, None)).T
    inv_spread = 1.0 / (np.std(log_feats, axis=1) + 1e-8)
    w = inv_spread / np.sum(inv_spread)

    # 4) Allocate mean shifted loss across three terms
    k_inits = np.mean(y_shift) * w  # positive initial magnitudes

    # 5) Parameterize k_i via log(k_i) for stability
    lk1_init, lk2_init, lk3_init = np.log(np.clip(k_inits, 1e-12, None))

    # 6) Exponent initial guesses
    a_init = 0.5

    init = np.array([
        c0_init,
        lk1_init, a_init,
        lk2_init, a_init,
        lk3_init, a_init,
    ], dtype=float)

    # 7) Bounds: c0 as chosen above, exponents ∈ [0,5], logs unbounded
    bounds = [
        c0_bounds,     # c0
        (None, None),  # lk1
        (0.0, 5.0),    # a1
        (None, None),  # lk2
        (0.0, 5.0),    # a2
        (None, None),  # lk3
        (0.0, 5.0),    # a3
    ]

    # 8) Objective: mean squared error
    def objective(p):
        pred = scaling_law_func(X, p)
        return np.mean((pred - y) ** 2)

    # 9) Optimize with L-BFGS-B
    result = minimize(
        objective,
        init,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-12, 'gtol': 1e-8, 'maxiter': 5000}
    )

    # 10) Keep the better of the optimiser's iterate and the starting point.
    #     A run that stops without `success` (e.g. on the iteration limit)
    #     can still be a much better fit than the initial guess, so it is
    #     only rejected when it is malformed, non-finite, or worse.
    candidate = np.asarray(result.x, dtype=float)
    if candidate.shape == init.shape and np.all(np.isfinite(candidate)):
        init_mse = objective(init)
        cand_mse = objective(candidate)
        if np.isfinite(cand_mse) and (
            not np.isfinite(init_mse) or cand_mse <= init_mse
        ):
            return candidate
    # fallback to initialization
    return init
# EVOLVE-BLOCK-END
#2 Run 5 R² = 0.917510
#3 Run 2 R² = 0.915617
#4 Run 1 R² = 0.912313
#5 Run 4 R² = 0.813374