
Vocabulary Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.979398
Mean R²: 0.979398
Min R²: 0.979396
Runs: 5

All Runs (sorted by R²)

Best · Run 2 · R² = 0.979398
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning.
Model: Lossu = Bias + c1*(N/S1)^e1 + c2*(V/S2)^e2 + c3*(D/S3)^e3
Optimization: Variable Projection (VarPro) driven by Differential Evolution.
1. Separates non-linear parameters (exponents) from linear parameters (coeffs, bias).
2. Uses Differential Evolution to globally search for optimal exponents, robust to local minima.
3. Uses Ridge-regularized Linear Least Squares in the inner loop to handle collinearity
   (e.g., when exponents are close to 0 or to each other).
4. Re-fits the final linear coefficients with plain OLS (no ridge penalty) so they are
   not shrunk by the regularization.
"""
import numpy as np
from scipy.optimize import differential_evolution

# Fixed scaling factors to normalize inputs (N, V, D)
# Brings each feature to roughly order 1 for numerical stability
SCALES = np.array([1e9, 1e4, 1e11])
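# (Presumably N = model parameters ~1e9, V = vocabulary size ~1e4, D = training tokens ~1e11;
#  the exact meaning of each column is an assumption, not stated in this listing.)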

def scaling_law_func(data_points, params):
    """
    Predicts Lossu.
    params: [Bias, c1, c2, c3, e1, e2, e3]
    """
    X = np.atleast_2d(np.asarray(data_points))
    # Normalize inputs
    X_norm = X / SCALES[None, :]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Unpack
    bias      = params[:, 0]
    coeffs    = params[:, 1:4]
    exponents = params[:, 4:7]
    
    # Compute power terms: (X_norm)^e
    # Use abs for safety, though inputs are positive
    # Broadcasting: (N, 1, 3) ** (1, T, 3) -> (N, T, 3)
    terms = (np.abs(X_norm[:, None, :]) + 1e-12) ** exponents[None, :, :]
    
    # Weighted sum: Bias + c1*T1 + c2*T2 + c3*T3
    pred = (coeffs[None, :, :] * terms).sum(axis=2) + bias[None, :]
    
    return pred[:, 0] if pred.shape[1] == 1 else pred

def fit_scaling_law(data_points, loss_values):
    """
    Fits parameters using Variable Projection with Differential Evolution.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    if y.ndim == 1:
        y = y[:, None]
        
    N_samples = X.shape[0]
    N_targets = y.shape[1]
    
    X_norm = X / SCALES[None, :]
    # Precompute log for speed in optimization loop
    # Add epsilon to avoid log(0)
    log_X = np.log(np.abs(X_norm) + 1e-12)
    
    results = []
    
    for t in range(N_targets):
        y_curr = y[:, t]
        
        # Inner solver: Ridge Regression to find optimal coeffs for given exponents
        # Returns MSE
        def objective(exps):
            # exps: [e1, e2, e3]
            # Terms: exp(e * log_x)
            terms = np.exp(exps[None, :] * log_X)
            
            # Design matrix: [1, terms]
            A = np.column_stack([np.ones(N_samples), terms])
            
            # Ridge solve: (A.T A + alpha I) w = A.T y
            # Regularization prevents singularity when exponents are similar or 0
            alpha = 1e-7
            AtA = A.T @ A
            Aty = A.T @ y_curr
            
            # Regularize diagonal
            reg_matrix = np.eye(A.shape[1]) * alpha
            reg_matrix[0, 0] = 0 # Do not penalize Bias intercept
            
            try:
                w = np.linalg.solve(AtA + reg_matrix, Aty)
            except np.linalg.LinAlgError:
                return 1e10 # Fail
            
            # MSE
            pred = A @ w
            mse = np.mean((pred - y_curr)**2)
            return mse

        # Global Optimization for Exponents
        # Bounds: [-4.0, 4.0] covers inverse, inverse-square, linear, quadratic, etc.
        bounds = [(-4.0, 4.0), (-4.0, 4.0), (-4.0, 4.0)]
        
        # Differential Evolution
        # Robust global search. polish=True performs local gradient-based refinement (L-BFGS-B)
        res = differential_evolution(objective, bounds, 
                                     strategy='best1bin', 
                                     popsize=15, 
                                     tol=1e-5, 
                                     maxiter=100,
                                     polish=True, 
                                     seed=42)
        
        best_exps = res.x
        
        # Re-fit linear coefficients with plain OLS (no ridge) so they are not shrunk by regularization
        terms = np.exp(best_exps[None, :] * log_X)
        A = np.column_stack([np.ones(N_samples), terms])
        w, _, _, _ = np.linalg.lstsq(A, y_curr, rcond=None)
        
        # Params: [Bias, c1, c2, c3, e1, e2, e3]
        params = np.concatenate([w, best_exps])
        results.append(params)
        
    return np.vstack(results)[0] if N_targets == 1 else np.vstack(results)
# EVOLVE-BLOCK-END
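
Usage sketch (not part of the evolved block): the snippet below assumes the two functions above are in scope, generates synthetic (N, V, D) points from an arbitrary ground-truth parameter vector, fits them with fit_scaling_law, and reports R². The ranges, coefficients, and noise level are invented for illustration and are not the benchmark data.

Python
# Minimal usage sketch. Synthetic data only; ranges and "true" parameters are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n_pts = 200

# Hypothetical grid: N ~ 1e8-1e10, V ~ 1e3-1e5, D ~ 1e10-1e12 (matches the SCALES order above)
X = np.column_stack([
    10 ** rng.uniform(8, 10, n_pts),   # N
    10 ** rng.uniform(3, 5, n_pts),    # V
    10 ** rng.uniform(10, 12, n_pts),  # D
])

# Arbitrary ground truth in the fitted form [Bias, c1, c2, c3, e1, e2, e3]
true_params = np.array([2.0, 0.5, 0.3, 0.4, -0.30, -0.20, -0.25])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, n_pts)

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(f"Recovered params: {np.round(fitted, 3)}")
print(f"R^2 on synthetic data: {r2:.4f}")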
#2 · Run 3 · R² = 0.979398
#3 · Run 4 · R² = 0.979398
#4 · Run 5 · R² = 0.979398
#5 · Run 1 · R² = 0.979396