# EVOLVE-BLOCK-START
"""
Stable log-parameterization additive power-law scaling for LLM loss under
unique_tokens (U), parameters (P), and tokens (T) constraints:
L(U,P,T) = c0
+ exp(lk1 - a1·ln U)
+ exp(lk2 - a2·ln P)
+ exp(lk3 - a3·ln T)
7 parameters:
c0, lk1, a1, lk2, a2, lk3, a3
Positivity of k-terms is enforced via exp(log-k). Exponents a_i ∈ [0,5].
Fitted via L-BFGS-B with bounds for numerical stability.
"""

import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
"""
Predict cross-entropy loss from (U, P, T) data.
Args:
data_points: array-like of shape (N,3): [unique_tokens, params, tokens]
params: array of 7 floats: [c0, lk1, a1, lk2, a2, lk3, a3]
Returns:
preds: ndarray of shape (N,) of predicted losses.
"""
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X[None, :]
    # Clip inputs away from zero to avoid log(0).
    U = np.clip(X[:, 0], 1e-8, None)
    P = np.clip(X[:, 1], 1e-8, None)
    T = np.clip(X[:, 2], 1e-8, None)
    c0, lk1, a1, lk2, a2, lk3, a3 = params
    lnU, lnP, lnT = np.log(U), np.log(P), np.log(T)
    # Additive sum of three positive power-law terms, evaluated in log-space:
    # exp(lk_i - a_i·ln X) == k_i / X**a_i with k_i = exp(lk_i).
    termU = np.exp(lk1 - a1 * lnU)
    termP = np.exp(lk2 - a2 * lnP)
    termT = np.exp(lk3 - a3 * lnT)
    return c0 + termU + termP + termT
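

# A minimal usage sketch (the parameter values below are illustrative
# assumptions, not fitted results):
#
#     p = np.array([1.8, 5.5, 0.30, 5.8, 0.35, 5.8, 0.28])
#     scaling_law_func([[1e9, 1e8, 1e10]], p)   # -> array of shape (1,)
#
# Each term shrinks as its budget (U, P, or T) grows, so predictions
# decay toward the irreducible floor c0.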


def fit_scaling_law(data_points, loss_values):
    """
    Fit the 7-parameter scaling law to (U, P, T) → loss data.

    Args:
        data_points: ndarray of shape (N, 3) with [unique_tokens, params, tokens]
        loss_values: ndarray of shape (N,) of observed losses

    Returns:
        params_opt: ndarray of fitted parameters [c0, lk1, a1, lk2, a2, lk3, a3]
    """
    X = np.asarray(data_points, dtype=float)
    y = np.asarray(loss_values, dtype=float).ravel()
    # 1) Initialize c0 just below the lower envelope of y, capped at y_min
    #    so the starting point respects the c0 bound defined below.
    y_min = np.min(y)
    c0_init = min(max(0.0, np.percentile(y, 5) * 0.9), y_min)
    # 2) Shifted target for the k-terms.
    y_shift = np.clip(y - c0_init, 1e-12, None)
    # 3) Inverse log-spread weights for U, P, T: inputs whose logs vary
    #    less in the data receive a larger share of the initial loss.
    log_feats = np.vstack([
        np.log(np.clip(X[:, 0], 1e-12, None)),
        np.log(np.clip(X[:, 1], 1e-12, None)),
        np.log(np.clip(X[:, 2], 1e-12, None)),
    ])
    inv_spread = 1.0 / (np.std(log_feats, axis=1) + 1e-8)
    w = inv_spread / np.sum(inv_spread)
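    # For example (illustrative numbers only): if std(ln U) = 2 and
    # std(ln P) = std(ln T) = 4, then inv_spread ≈ [0.50, 0.25, 0.25],
    # which already sums to 1, so w ≈ [0.50, 0.25, 0.25] and the U-term
    # starts with twice the coefficient of the other two.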
    # 4) Allocate the mean shifted loss across the three terms.
    base = np.mean(y_shift)
    k_inits = base * w  # positive initial magnitudes
    # 5) Parameterize each k_i via log(k_i) for stability.
    lk1_init, lk2_init, lk3_init = np.log(np.clip(k_inits, 1e-12, None))
    # 6) Initial guesses for the exponents.
    a1_init = a2_init = a3_init = 0.5
    init = np.array([
        c0_init,
        lk1_init, a1_init,
        lk2_init, a2_init,
        lk3_init, a3_init,
    ], dtype=float)
    # 7) Bounds: c0 ∈ [0, y_min] (the three terms are nonnegative, so a
    #    floor above the smallest observed loss could never fit that point),
    #    exponents ∈ [0, 5], log-coefficients unbounded.
    bounds = [
        (0.0, y_min),   # c0
        (None, None),   # lk1
        (0.0, 5.0),     # a1
        (None, None),   # lk2
        (0.0, 5.0),     # a2
        (None, None),   # lk3
        (0.0, 5.0),     # a3
    ]
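    # Note: a_i = 0 is allowed, which turns term i into the constant
    # exp(lk_i); the optimizer can thereby effectively drop a variable
    # that carries no signal, folding its term into the overall floor.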
    # 8) Objective: mean squared error between predictions and targets.
    def objective(p):
        pred = scaling_law_func(X, p)
        return np.mean((pred - y) ** 2)
    # 9) Optimize with bounded L-BFGS-B.
    result = minimize(
        objective,
        init,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-12, 'gtol': 1e-8, 'maxiter': 5000},
    )
    if result.success:
        return result.x
    # Fall back to the initialization if the optimizer fails to converge.
    return init
# EVOLVE-BLOCK-END
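

if __name__ == "__main__":
    # Self-check sketch on synthetic data. The ground-truth parameters and
    # budget ranges below are assumptions chosen for illustration, not
    # values from any real LLM training runs.
    rng = np.random.default_rng(0)
    true_params = np.array([1.8, 5.5, 0.30, 5.8, 0.35, 5.8, 0.28])
    U = 10 ** rng.uniform(8, 10, size=64)    # unique tokens
    P = 10 ** rng.uniform(7, 9, size=64)     # model parameters
    T = 10 ** rng.uniform(9, 11, size=64)    # training tokens
    X = np.column_stack([U, P, T])
    y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=64)
    fitted = fit_scaling_law(X, y)
    rmse = np.sqrt(np.mean((scaling_law_func(X, fitted) - y) ** 2))
    print("fitted params:", np.round(fitted, 4))
    print("train RMSE:", float(rmse))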