
LR-BSZ Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.940806
Mean R²: 0.940713
Min R²: 0.940533
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 2 R² = 0.940806
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
Implements a refined scaling law with terms for Model Size, Dataset Size, 
Learning Rate (quadratic penalty in log-space), and Batch Size.
Uses physics-informed bounds and multi-start L-BFGS-B optimization.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predicts LM loss based on scaling law parameters.
    
    Model Form:
    L = E + A*N^(-alpha) + B*D^(-beta) + C*(log(lr) - log_lr_opt)^2 + F*bsz^G
    
    Where:
    log_lr_opt = d0 + d1*log(N) + d2*log(bsz)
    
    Inputs are normalized:
    - N: Parameters / 1e9
    - D: Tokens / 1e10
    - lr: Learning Rate / 1e-3
    - bsz: Batch Size / 2048
    
    Parameters (11 total):
    0: E (Irreducible loss)
    1: A (Model size coeff)
    2: alpha (Model size exponent)
    3: B (Data size coeff)
    4: beta (Data size exponent)
    5: C (LR penalty coeff)
    6: d0 (Opt LR intercept)
    7: d1 (Opt LR slope w.r.t N)
    8: d2 (Opt LR slope w.r.t bsz)
    9: F (Batch size coeff)
    10: G (Batch size exponent)
    """
    # Normalization constants (based on dataset statistics)
    # Feature order: [lr, bsz, data_size, non_embedding_param_size]
    # Using 2048 for bsz as it matches the max value in the dataset
    scales = np.array([1e-3, 2048.0, 1e10, 1e9])
    
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    X_norm = X / scales[None, :]
    
    lr = X_norm[:, 0]
    bsz = X_norm[:, 1]
    D = X_norm[:, 2]
    N_param = X_norm[:, 3]
    
    # Handle params shape
    params = np.asarray(params, dtype=np.float64)
    original_ndim = params.ndim
    if original_ndim == 1:
        params = params[None, :]
    
    # Unpack parameters
    E     = params[:, 0:1]
    A     = params[:, 1:2]
    alpha = params[:, 2:3]
    B     = params[:, 3:4]
    beta  = params[:, 4:5]
    C     = params[:, 5:6]
    d0    = params[:, 6:7]
    d1    = params[:, 7:8]
    d2    = params[:, 8:9]
    F     = params[:, 9:10]
    G     = params[:, 10:11]
    
    eps = 1e-9
    
    # Broadcasting preparation
    N_p = N_param[None, :]
    D_p = D[None, :]
    lr_p = lr[None, :]
    bsz_p = bsz[None, :]
    
    # 1. Power laws for N and D
    # Take abs() of alpha and beta so both terms decay as N and D grow
    term_N = A * ((N_p + eps) ** (-np.abs(alpha)))
    term_D = B * ((D_p + eps) ** (-np.abs(beta)))
    
    # 2. Learning Rate Penalty
    # Optimal LR depends on N and bsz
    log_N = np.log(N_p + eps)
    log_bsz = np.log(bsz_p + eps)
    log_lr = np.log(lr_p + eps)
    
    # Linear relationship in log-log space
    opt_log_lr = d0 + d1 * log_N + d2 * log_bsz
    term_LR = C * ((log_lr - opt_log_lr) ** 2)
    
    # 3. Direct batch-size effect (independent of its influence on the optimal LR)
    term_BSZ = F * ((bsz_p + eps) ** G)
    
    # Total Loss
    pred = E + term_N + term_D + term_LR + term_BSZ
    
    # Return shape handling
    pred = pred.T
    if original_ndim == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law parameters using multi-start L-BFGS-B.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    def objective(p):
        preds = scaling_law_func(X, p)
        return np.mean((preds - y)**2)
    
    # Parameter Bounds
    # E: [1.0, 2.2] - Irreducible loss must be < min(loss) ~ 2.1
    # A, B: [0, inf] - Coefficients
    # alpha, beta: [0.01, 1.0] - Exponents typically < 1.0
    # C: [0, inf] - Penalty curvature
    # d1: [-2.0, 0.5] - LR usually decreases with Model Size
    # d2: [-0.5, 2.0] - LR usually increases with Batch Size
    bounds = [
        (1.0, 2.2),   # E
        (0.0, None),  # A
        (0.01, 1.0),  # alpha
        (0.0, None),  # B
        (0.01, 1.0),  # beta
        (0.0, None),  # C
        (None, None), # d0
        (-2.0, 0.5),  # d1
        (-0.5, 2.0),  # d2
        (None, None), # F
        (None, None)  # G
    ]
    
    # Multiple initializations to avoid local minima
    guesses = [
        # 1. Standard Scaling: alpha, beta ~ 0.1-0.3
        [1.8, 0.5, 0.1, 0.5, 0.1, 0.2, 0.0, -0.2, 0.2, 0.01, 0.0],
        # 2. Chinchilla-like: alpha, beta ~ 0.5
        [1.6, 2.0, 0.5, 2.0, 0.5, 0.5, -0.5, -0.5, 0.5, 0.05, -0.1],
        # 3. High Variance: small E, large coeffs
        [1.2, 5.0, 0.2, 5.0, 0.2, 0.1, 0.0, -0.1, 0.1, 0.0, 0.0]
    ]
    
    best_loss = np.inf
    best_params = np.array(guesses[0])
    
    for p0 in guesses:
        try:
            res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds,
                          options={'maxiter': 3000, 'ftol': 1e-10, 'gtol': 1e-10})
            if res.fun < best_loss:
                best_loss = res.fun
                best_params = res.x
        except Exception:
            continue
            
    return best_params
# EVOLVE-BLOCK-END
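
Below is a minimal usage sketch, not part of the submitted program: it builds synthetic finetuning configurations in the feature order the code documents ([lr, bsz, data_size, non_embedding_param_size]), fits the law, and scores the fit with R². The configuration ranges and the "true" parameter vector are invented for illustration and are not taken from the benchmark dataset; the sketch assumes scaling_law_func and fit_scaling_law from the listing above are already in scope.

Python
import numpy as np

rng = np.random.default_rng(0)

# 200 synthetic finetuning configurations (illustrative ranges only)
n = 200
lr  = 10 ** rng.uniform(-4.5, -2.5, n)        # learning rate
bsz = rng.choice([256, 512, 1024, 2048], n)   # batch size
D   = 10 ** rng.uniform(9.0, 11.0, n)         # training tokens
N   = 10 ** rng.uniform(8.0, 10.0, n)         # non-embedding parameters
X = np.column_stack([lr, bsz, D, N])

# Hypothetical ground-truth parameters in the 11-slot layout from the
# docstring, used only to generate a noisy target for this demo.
true_params = np.array([1.8, 0.5, 0.3, 0.4, 0.25, 0.05,
                        -0.5, -0.3, 0.3, 0.02, -0.2])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, n)

# Fit and evaluate
fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print("fitted params:", np.round(fitted, 3))
print("R^2 on synthetic data:", 1.0 - ss_res / ss_tot)

Because the fit is a three-start local optimization, the recovered parameters need not match true_params exactly; the R² check is the relevant sanity test here.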
#2 Run 4 R² = 0.940792
#3 Run 3 R² = 0.940739
#4 Run 1 R² = 0.940693
#5 Run 5 R² = 0.940533