
Vocabulary Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.978871
Mean R²: 0.936261
Min R²: 0.892967
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 5 R² = 0.978871
Python
# EVOLVE-BLOCK-START
"""
Enhanced scaling law with adaptive reference scales and improved regularization
Achieves better generalization through data-informed normalization and
sophisticated parameter-specific regularization strategies
Uses 7-parameter additive model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Additive power-law scaling law with 7 parameters
    Model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
    
    Adaptive normalization based on data statistics for better numerical behavior.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if X.shape[1] != 3:
        raise ValueError(f"Expected 3 features, got {X.shape[1]}")
    
    P_nv = X[:, 0]
    V = X[:, 1]
    N_chars = X[:, 2]
    
    if len(params) < 7:
        params = np.concatenate([params, np.zeros(7 - len(params))])
    
    a0, a1, a2, a3, a4, a5, a6 = params[:7]
    
    # Adaptive reference scales: geometric mean of each feature's min and max
    # (the log-midpoint of its range), more stable than fixed constants
    # across different data distributions
    P_ref = np.exp(0.5 * (np.log(np.min(P_nv[P_nv > 0]) + 1e-10) + 
                           np.log(np.max(P_nv) + 1e-10)))
    V_ref = np.exp(0.5 * (np.log(np.min(V[V > 0]) + 1e-10) + 
                           np.log(np.max(V) + 1e-10)))
    N_ref = np.exp(0.5 * (np.log(np.min(N_chars[N_chars > 0]) + 1e-10) + 
                           np.log(np.max(N_chars) + 1e-10)))
    
    # Normalize inputs
    P_nv_norm = P_nv / P_ref
    V_norm = V / V_ref
    N_chars_norm = N_chars / N_ref
    
    # Clip normalized inputs to a safe range to avoid overflow in the power terms
    P_nv_norm = np.clip(P_nv_norm, 1e-4, 1e4)
    V_norm = np.clip(V_norm, 1e-4, 1e4)
    N_chars_norm = np.clip(N_chars_norm, 1e-4, 1e4)
    
    # Power law terms
    term_params = a1 * np.power(P_nv_norm, a2)
    term_vocab = a3 * np.power(V_norm, a4)
    term_data = a5 * np.power(N_chars_norm, a6)
    
    return a0 + term_params + term_vocab + term_data


def fit_scaling_law(data_points, loss_values):
    """
    Advanced three-phase hierarchical optimization with adaptive strategies
    Phase 1: Global exploration with differential evolution
    Phase 2: Local refinement from theory-informed initializations
    Phase 3: Final convergence with tight tolerances and parameter-specific tuning
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    # Standardize targets so the MSE objective and the fixed regularization
    # weights behave consistently across datasets with different loss scales
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_norm = (y - y_mean) / y_std
    
    # Domain-informed bounds with tighter exponent constraints
    bounds = [
        (-2.0, 2.0),    # a0: intercept (baseline loss)
        (-15.0, 15.0),  # a1: parameter coefficient
        (-1.3, 1.3),    # a2: parameter exponent (tighter - usually -0.5 to 0)
        (-15.0, 15.0),  # a3: vocab coefficient
        (-1.0, 0.3),    # a4: vocab exponent (tighter - typically -0.4 to -0.1)
        (-15.0, 15.0),  # a5: data coefficient
        (-0.5, -0.01),  # a6: data exponent (much tighter - all negative)
    ]
    
    def objective(params):
        """Objective with parameter-specific regularization"""
        try:
            pred = scaling_law_func(X, params)
            
            if not np.all(np.isfinite(pred)):
                return 1e10
            
            pred_norm = (pred - y_mean) / y_std
            mse = np.mean((pred_norm - y_norm) ** 2)
            
            # Parameter-specific regularization (stronger on exponents than coefficients)
            reg_coeff = 0.0005 * (params[1]**2 + params[3]**2 + params[5]**2)
            reg_exp = 0.0020 * (params[2]**2 + params[4]**2 + params[6]**2)
            reg_intercept = 0.0010 * params[0]**2
            
            return mse + reg_coeff + reg_exp + reg_intercept
            
        except Exception:
            # Guard against numerical failures during model evaluation
            return 1e10
    
    # Phase 1: Global exploration with differential evolution
    result_de = differential_evolution(
        objective, bounds, seed=42, maxiter=600, popsize=28,
        atol=1e-11, tol=1e-11, mutation=(0.5, 1.5), recombination=0.8,
        workers=1, updating='deferred', polish=False
    )
    
    # Track the best candidate found so far; later phases only overwrite it on improvement
    best_params = result_de.x
    best_loss = result_de.fun
    
    # Phase 2: Local refinement from multiple theory-informed initializations
    smart_inits = [
        result_de.x,  # Best from DE
        np.array([-0.2, -2.8, -0.55, -3.2, -0.28, -3.2, -0.22]),  # Refined Chinchilla
        np.array([-0.8, -3.2, -0.75, -1.8, -0.15, -2.8, -0.28]),  # Data-heavy
        np.array([-0.4, -1.8, -0.35, -3.8, -0.35, -3.8, -0.35]),  # Vocab-heavy
        np.array([-0.6, -2.2, -0.65, -2.5, -0.22, -2.2, -0.18]),  # Balanced
        np.array([-0.3, -2.0, -0.45, -2.8, -0.30, -3.5, -0.25]),  # Parameter-focused
        np.array([-1.0, -1.5, -0.80, -1.2, -0.10, -2.0, -0.12]),  # Minimal scaling
    ]
    
    for init_params in smart_inits:
        # Clip to bounds to ensure valid initialization
        init_params = np.array([np.clip(p, b[0], b[1]) for p, b in zip(init_params, bounds)])
        
        result = minimize(
            objective, init_params, method='L-BFGS-B', bounds=bounds,
            options={'ftol': 1e-13, 'gtol': 1e-12, 'maxiter': 1000, 'maxls': 120}
        )
        
        if result.fun < best_loss:
            best_loss = result.fun
            best_params = result.x
    
    # Phase 3: Final ultra-tight refinement with adaptive tolerance
    result_final = minimize(
        objective, best_params, method='L-BFGS-B', bounds=bounds,
        options={'ftol': 1e-15, 'gtol': 1e-13, 'maxiter': 2000, 'maxls': 200}
    )
    
    if result_final.fun < best_loss:
        best_loss = result_final.fun
        best_params = result_final.x
    
    # Phase 3b: One more tight global polish around best solution
    tight_bounds = [
        (max(b[0], p - 0.25), min(b[1], p + 0.25))
        for b, p in zip(bounds, best_params)
    ]
    
    result_polish = differential_evolution(
        objective, tight_bounds, seed=43, maxiter=200, popsize=15,
        atol=1e-12, tol=1e-12, mutation=(0.5, 1.5), recombination=0.9,
        workers=1, updating='deferred', polish=True
    )
    
    if result_polish.fun < best_loss:
        best_params = result_polish.x
    
    return best_params
# EVOLVE-BLOCK-END
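
For readers who want to exercise the program above end to end, here is a minimal usage sketch. The data grid, the generating law, and the column interpretation (P_nv as non-vocabulary parameters, V as vocabulary size, N_chars as training characters) are illustrative assumptions, not the benchmark's held-out data; the R² computed here is in-sample and only demonstrates the API.

Python
import numpy as np

# Hypothetical synthetic grid with columns (P_nv, V, N_chars), matching the
# feature order expected by scaling_law_func. All values are made up.
rng = np.random.default_rng(0)
p_nv_demo = rng.uniform(1e7, 1e9, size=40)
v_demo = rng.uniform(4e3, 1.3e5, size=40)
n_chars_demo = rng.uniform(1e9, 1e11, size=40)
X_demo = np.column_stack([p_nv_demo, v_demo, n_chars_demo])

# Losses drawn from an arbitrary additive power law plus noise, purely to
# exercise the fitting pipeline; this is not the benchmark's data.
y_demo = (1.7
          + 2.0 * (p_nv_demo / 1e8) ** -0.3
          + 1.5 * (v_demo / 3e4) ** -0.2
          + 2.5 * (n_chars_demo / 1e10) ** -0.15
          + rng.normal(0.0, 0.01, size=40))

params = fit_scaling_law(X_demo, y_demo)
pred = scaling_law_func(X_demo, params)

# Coefficient of determination, analogous to the leaderboard's R² metric
# (computed in-sample here).
ss_res = np.sum((y_demo - pred) ** 2)
ss_tot = np.sum((y_demo - np.mean(y_demo)) ** 2)
print("fitted params:", np.round(params, 3))
print("R^2:", 1.0 - ss_res / ss_tot)

One design consequence is visible in this sketch: because scaling_law_func recomputes its reference scales from whatever data_points it receives, predictions on a new grid are implicitly renormalized to that grid, so the fitted exponents should be read relative to the log-midpoints of the data the law was fit on.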
#2 Run 1 R² = 0.949983
#3 Run 3 R² = 0.934849
#4 Run 2 R² = 0.924633
#5 Run 4 R² = 0.892967