Vocabulary Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.986132
Mean R²: 0.984428
Min R²: 0.980252
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.986132
Python
# EVOLVE-BLOCK-START
"""
Optimized scaling law with multiplicative vocabulary modulation
Key improvements:
1. Multiplicative vocab modulation: (1 + b*log(V)) directly scales base efficiency
2. Dual interaction terms: both P-based and D-based for comprehensive modeling
3. Enhanced optimization with basin-hopping and adaptive bounds
4. Tighter regularization targeting Chinchilla-optimal values
Uses exactly 7 parameters
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution, basinhopping

def scaling_law_func(data_points, params):
    """
    Optimized scaling law with multiplicative vocabulary effects:
    L = a * P^(-alpha) * D^(-beta) * (1 + b*log(V)) + c*log(V)/(P^gamma * D^delta) + offset
    
    Wait, that's 8 parameters. Let me simplify to 7:
    L = a * P^(-alpha) * D^(-beta) * (1 + b*log(V)) + c*log(V)/D^gamma + offset
    
    This captures:
    - Base Chinchilla power law: a * P^(-alpha) * D^(-beta)
    - Multiplicative vocab efficiency: (1 + b*log(V))
    - Data-vocab interaction: c*log(V)/D^gamma (vocab helps with more data)
    - Baseline offset
    
    7 parameters: [a, alpha, beta, b, c, gamma, offset]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64).ravel()
    
    # Ensure exactly 7 parameters
    if len(params) < 7:
        params = np.pad(params, (0, 7 - len(params)), constant_values=0.0)
    params = params[:7]
    
    # Extract features with numerical stability
    eps = 1e-10
    P = np.maximum(X[:, 0], eps)  # non_vocab_parameters
    V = np.maximum(X[:, 1], eps)  # vocab_size
    D = np.maximum(X[:, 2], eps)  # num_characters
    
    # Extract parameters
    a, alpha, beta, b, c, gamma, offset = params
    
    # Force positive exponents for numerical stability
    alpha = np.abs(alpha)
    beta = np.abs(beta)
    gamma = np.abs(gamma)
    
    # Compute log vocabulary once
    log_V = np.log(V)
    
    # Term 1: Base power law with multiplicative vocabulary modulation
    base_scaling = a * np.power(P, -alpha) * np.power(D, -beta)
    vocab_multiplier = 1.0 + b * log_V
    term1 = base_scaling * vocab_multiplier
    
    # Term 2: Data-vocabulary interaction
    # Captures how vocabulary efficiency depends on data availability
    term2 = c * log_V * np.power(D, -gamma)
    
    # Final prediction
    pred = term1 + term2 + offset
    
    return pred
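

# Hedged usage sketch (an editorial illustration, not part of the evolved law):
# shows the calling convention of scaling_law_func on a single hypothetical
# configuration. All values below are assumptions chosen for demonstration,
# not fitted results.
def _example_scaling_law_call():
    # One data point: [P (non-vocab parameters), V (vocab size), D (num characters)]
    demo_point = [[1.0e8, 32768.0, 1.0e10]]
    # Hypothetical parameters [a, alpha, beta, b, c, gamma, offset]
    demo_params = [8.0, 0.34, 0.28, 0.02, 0.5, 0.25, 1.7]
    # Returns a length-1 array containing the predicted loss
    return scaling_law_func(demo_point, demo_params)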


def fit_scaling_law(data_points, loss_values):
    """
    Four-stage robust optimization:
    1. Differential evolution with wide exploration
    2. L-BFGS-B refinement
    3. Basin-hopping to escape local minima
    4. Final TNC polish
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    # Compute statistics
    y_mean = np.mean(y)
    y_std = np.std(y)
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y
            mse = np.mean(residuals ** 2)
            
            # Light regularization: penalize deviation from Chinchilla-optimal
            # exponents (alpha ~ 0.34, beta ~ 0.28 in the Chinchilla paper)
            chinchilla_penalty = 1e-7 * (
                (params[1] - 0.34)**2 + 
                (params[2] - 0.28)**2 + 
                params[5]**2  # Keep gamma small
            )
            
            return mse + chinchilla_penalty
        except Exception:
            # Penalize any parameter set that produces a numerical error
            return 1e10
    
    # Optimized bounds based on top performers
    bounds = [
        (0.001, 100.0),    # a: scale coefficient
        (0.01, 2.0),       # alpha: param exponent
        (0.01, 2.0),       # beta: data exponent
        (-1.0, 1.0),       # b: vocab multiplier
        (-10.0, 10.0),     # c: interaction coefficient
        (0.01, 2.0),       # gamma: interaction exponent
        (y_mean - 4*y_std, y_mean + 2*y_std)  # offset
    ]
    
    # Stage 1: Global search with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        maxiter=500,
        popsize=25,
        seed=42,
        atol=1e-9,
        tol=1e-9,
        workers=1,
        strategy='best1bin',
        mutation=(0.5, 1.8),
        recombination=0.8,
        polish=False
    )
    
    best_params = result_de.x
    best_loss = result_de.fun
    
    # Stage 2: L-BFGS-B refinement
    result_lbfgs = minimize(
        objective,
        best_params,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 1500,
            'ftol': 1e-12,
            'gtol': 1e-10
        }
    )
    
    if result_lbfgs.success and result_lbfgs.fun < best_loss:
        best_params = result_lbfgs.x
        best_loss = result_lbfgs.fun
    
    # Stage 3: Basin-hopping to escape local minima
    class BoundsChecker:
        def __init__(self, bounds):
            self.bounds = bounds
        
        def __call__(self, **kwargs):
            x = kwargs["x_new"]
            tmax = bool(np.all(x <= [b[1] for b in self.bounds]))
            tmin = bool(np.all(x >= [b[0] for b in self.bounds]))
            return tmax and tmin
    
    minimizer_kwargs = {
        "method": "L-BFGS-B",
        "bounds": bounds,
        "options": {"maxiter": 500, "ftol": 1e-11}
    }
    
    try:
        result_bh = basinhopping(
            objective,
            best_params,
            minimizer_kwargs=minimizer_kwargs,
            niter=30,
            T=1.0,
            stepsize=0.5,
            accept_test=BoundsChecker(bounds),
            seed=42
        )
        
        if result_bh.fun < best_loss:
            best_params = result_bh.x
            best_loss = result_bh.fun
    except Exception:
        # Basin-hopping is optional; keep the best result found so far
        pass
    
    # Stage 4: Final TNC polish
    try:
        result_tnc = minimize(
            objective,
            best_params,
            method='TNC',
            bounds=bounds,
            options={'maxiter': 800, 'ftol': 1e-12}
        )
        
        if result_tnc.success and result_tnc.fun < best_loss:
            best_params = result_tnc.x
    except Exception:
        # TNC polish is optional; fall back to the best parameters found so far
        pass
    
    return best_params
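

# Minimal end-to-end sketch (an editorial illustration, not part of the evolved
# submission): fits the law on synthetic data generated from hypothetical
# "true" parameters and reports the coefficient of determination R^2.
# Guarded so it never runs on import.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    P = rng.uniform(1e7, 1e9, n)    # non-vocab parameters
    V = rng.uniform(4e3, 1e5, n)    # vocabulary size
    D = rng.uniform(1e9, 1e11, n)   # number of characters
    X_demo = np.column_stack([P, V, D])

    # Hypothetical ground-truth parameters [a, alpha, beta, b, c, gamma, offset]
    true_params = np.array([8.0, 0.34, 0.28, 0.02, 0.5, 0.25, 1.7])
    y_demo = scaling_law_func(X_demo, true_params)
    y_demo = y_demo + rng.normal(0.0, 0.005, n)  # small observation noise

    fitted = fit_scaling_law(X_demo, y_demo)
    pred = scaling_law_func(X_demo, fitted)
    ss_res = np.sum((y_demo - pred) ** 2)
    ss_tot = np.sum((y_demo - np.mean(y_demo)) ** 2)
    print("R^2 on synthetic data:", 1.0 - ss_res / ss_tot)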
# EVOLVE-BLOCK-END
#2 Run 4 R² = 0.985661
#3 Run 2 R² = 0.985194
#4 Run 5 R² = 0.984899
#5 Run 3 R² = 0.980252