
Vocabulary Scaling Law

Agent: SLDAgent
Model: o4-mini
Best R²: 0.986247
Mean R²: 0.985435
Min R²: 0.984961
Runs: 5
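
The R² figures above are presumably the standard coefficient of determination between the fitted law's predictions and the observed losses; the page does not show the evaluation split, so this is a minimal sketch of the metric, with the headline stats recomputed from the per-run scores listed below.

Python
import numpy as np

def r_squared(y_true, y_pred):
    # Coefficient of determination: 1 - SS_res / SS_tot.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot

# Per-run scores from this page; best / mean / min match the header stats.
run_scores = [0.986247, 0.986047, 0.984961, 0.984961, 0.984961]
print(max(run_scores), np.mean(run_scores), min(run_scores))
# -> best 0.986247, mean ≈ 0.985435, min 0.984961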

All Runs (sorted by R²)

#1 Run 3 (Best) R² = 0.986247
Python
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predict unigram-normalized loss (Lossu) via a two-term
    3D power‐law with synergy in the main term plus an additive constant:
      Lossu = C0 * exp[-(a·logP + b·logD + g·logV)]
            + C1 * exp[-(h·logD)]
            + C2

    Inputs:
      data_points: array of shape (N,3) columns = [P_non_vocab, Vocab_size, Num_characters]
      params: length-7 array = [C0, C1, C2, a, b, g, h]

    Returns:
      preds: length-N array of predicted Lossu
    """
    X = np.atleast_2d(data_points)
    P = X[:, 0]
    V = X[:, 1]
    D = X[:, 2]

    C0, C1, C2, a, b, g, h = params
    lp = np.log(P)
    lv = np.log(V)
    ld = np.log(D)

    term1 = np.exp(-(a * lp + b * ld + g * lv))
    term2 = np.exp(-h * ld)
    return C0 * term1 + C1 * term2 + C2


def fit_scaling_law(data_points, loss_values):
    """
    Fit the 7-parameter model by:
      1) optimizing exponents (a, b, g, h) in log-space (ensures positivity)
         while solving [C0,C1,C2] via linear least squares at each step;
      2) recovering an initial 7‐vector estimate;
      3) performing a bounded L-BFGS-B refinement on all 7 parameters
         to further minimize MSE.

    Returns:
      params_opt: length-7 array [C0, C1, C2, a, b, g, h]
    """
    X = np.atleast_2d(data_points)
    y = np.asarray(loss_values).ravel()
    P, V, D = X[:, 0], X[:, 1], X[:, 2]
    lp, lv, ld = np.log(P), np.log(V), np.log(D)

    # 1) Optimize exponents in log-space
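    # Note: for fixed exponents the model is linear in (C0, C1, C2), so the
    # inner problem is solved exactly by lstsq while the outer search runs
    # only over the four exponents (a separable / "variable projection"
    # style least-squares scheme).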
    def mse_exp(log_exps):
        # a, b, g, h > 0 by exponentiating
        a, b, g, h = np.exp(log_exps)
        phi1 = np.exp(-(a * lp + b * ld + g * lv))
        phi2 = np.exp(-h * ld)
        M = np.vstack((phi1, phi2, np.ones_like(y))).T
        coeffs, *_ = np.linalg.lstsq(M, y, rcond=None)
        y_pred = M.dot(coeffs)
        return np.mean((y_pred - y) ** 2)

    init_log_exps = np.log([0.5, 0.5, 0.5, 0.5])
    res1 = minimize(mse_exp, init_log_exps, method='L-BFGS-B')
    a0, b0, g0, h0 = np.exp(res1.x)

    # 2) Solve for C0, C1, C2 given the exponents
    phi1 = np.exp(-(a0 * lp + b0 * ld + g0 * lv))
    phi2 = np.exp(-h0 * ld)
    M_lin = np.vstack((phi1, phi2, np.ones_like(y))).T
    C0_0, C1_0, C2_0 = np.linalg.lstsq(M_lin, y, rcond=None)[0]

    initial_params = np.array([C0_0, C1_0, C2_0, a0, b0, g0, h0])

    # 3) Final refinement: optimize all 7 parameters together
    def mse_all(params):
        y_pred = scaling_law_func(X, params)
        return np.mean((y_pred - y) ** 2)

    # Bounds: exponents >= 1e-12, coefficients free
    bounds = [(None, None)] * 3 + [(1e-12, None)] * 4
    res2 = minimize(mse_all, initial_params,
                    method='L-BFGS-B',
                    bounds=bounds)

    return res2.x
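
A minimal usage sketch (not part of the submission): the inputs below are purely illustrative, generated from a hand-picked parameter vector, just to show the expected call pattern and column order.

Python
import numpy as np

# Hypothetical inputs: columns are [P_non_vocab, Vocab_size, Num_characters].
rng = np.random.default_rng(0)
P = rng.uniform(1e7, 1e9, size=200)
V = rng.uniform(4e3, 1e5, size=200)
D = rng.uniform(1e8, 1e10, size=200)
X = np.column_stack([P, V, D])

# Synthetic losses from an assumed "true" parameter vector, plus small noise.
true_params = np.array([50.0, 5.0, 0.5, 0.3, 0.2, 0.1, 0.25])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=len(X))

params_opt = fit_scaling_law(X, y)
preds = scaling_law_func(X, params_opt)
print("MSE:", np.mean((preds - y) ** 2))

The two-stage design gives L-BFGS-B a sensible starting point, which fits of multi-term exponential models like this one are typically sensitive to.
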
#2 Run 1 R² = 0.986047
#3 Run 2 R² = 0.984961
#4 Run 4 R² = 0.984961
#5 Run 5 R² = 0.984961