
Data-Constrained Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.919475
Mean R²: 0.894453
Min R²: 0.852179
Runs: 5

All Runs (sorted by R²)

Best Run 5 R² = 0.919475
Python
# EVOLVE-BLOCK-START
"""
Refined scaling law with data efficiency modeling
Key innovations:
- Standard power law base: A/P^alpha + B/D^beta + C/U^gamma
- Data efficiency term: F/(D/U)^delta to model repetition effects
- The D/U ratio captures how much data is "recycled" vs unique
- Simpler and more interpretable than log-based corrections
- Directly models the intuition that loss depends on the fraction of unique content
Uses 7 fitted parameters: [A, alpha, B, beta, C, gamma, F]; delta is fixed at 0.15,
so the efficiency term F*(U/D)^delta adds only one free coefficient.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Scaling law with data efficiency:
    L = A/P^alpha + B/D^beta + C/U^gamma + F*(U/D)^delta
    The (U/D)^delta term captures data repetition effects
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if params.ndim == 1:
        params = params[None, :]
    
    U = X[:, 0]  # unique_tokens
    P = X[:, 1]  # params
    D = X[:, 2]  # tokens
    
    eps = 1e-12
    U = np.maximum(U, eps)
    P = np.maximum(P, eps)
    D = np.maximum(D, eps)
    
    A, alpha, B, beta, C, gamma, F = params[0]
    
    # Standard power law terms
    term1 = A / (P ** alpha)
    term2 = B / (D ** beta)
    term3 = C / (U ** gamma)
    
    # Data efficiency term: models unique content fraction
    # When U/D is small (high repetition), this term is small
    # delta fixed at 0.15 to save a parameter (empirically good value)
    delta = 0.15
    efficiency_ratio = U / D
    efficiency_term = F * (efficiency_ratio ** delta)
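    # Illustrative example (values not from the benchmark): with delta = 0.15, a corpus
    # where each unique token is seen ~4 times (U/D = 0.25) gives (U/D)**delta ≈ 0.81,
    # versus 1.0 for fully unique data; the fitted sign of F determines whether
    # repetition raises or lowers the predicted loss.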
    
    pred = term1 + term2 + term3 + efficiency_term
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Streamlined two-stage optimization
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    U = X[:, 0]
    P = X[:, 1]
    D = X[:, 2]
    
    # Normalize U, P, D by their medians (computed in log space) and the loss by its median
    U_scale = np.exp(np.median(np.log(U + 1e-12)))
    P_scale = np.exp(np.median(np.log(P + 1e-12)))
    D_scale = np.exp(np.median(np.log(D + 1e-12)))
    y_scale = np.median(y)
    
    U_norm = U / U_scale
    P_norm = P / P_scale
    D_norm = D / D_scale
    y_norm = y / y_scale
    
    delta = 0.15
    
    def objective(params):
        A, alpha, B, beta, C, gamma, F = params
        
        eps = 1e-12
        P_safe = np.maximum(P_norm, eps)
        D_safe = np.maximum(D_norm, eps)
        U_safe = np.maximum(U_norm, eps)
        
        term1 = A / (P_safe ** alpha)
        term2 = B / (D_safe ** beta)
        term3 = C / (U_safe ** gamma)
        
        efficiency_ratio = U_safe / D_safe
        efficiency_term = F * (efficiency_ratio ** delta)
        
        pred = term1 + term2 + term3 + efficiency_term
        
        residuals = pred - y_norm
        mse = np.mean(residuals ** 2)
        
        # Balanced regularization
        reg = 0.007 * (np.abs(alpha - 0.37) + 
                       np.abs(beta - 0.37) + 
                       np.abs(gamma - 0.28))
        reg += 0.004 * np.abs(F)
        
        return mse + reg
    
    bounds = [
        (0.001, 135),   # A
        (0.07, 1.05),   # alpha
        (0.001, 135),   # B
        (0.07, 1.05),   # beta
        (0.001, 135),   # C
        (0.07, 0.88),   # gamma
        (-8, 8)         # F
    ]
    
    # Data-driven initialization
    y_min = np.min(y_norm)
    y_range = np.max(y_norm) - y_min
    
    init_guess = [
        y_range * 0.34,
        0.37,
        y_range * 0.32,
        0.37,
        y_range * 0.25,
        0.28,
        y_range * 0.06
    ]
    
    # Global search
    result = differential_evolution(
        objective, 
        bounds, 
        seed=42,
        maxiter=400,
        popsize=16,
        atol=1e-8,
        tol=1e-8,
        strategy='best1bin',
        mutation=(0.5, 1.2),
        recombination=0.75
    )
    
    params_opt = result.x if result.success else np.array(init_guess)
    
    # Local refinement
    result_local = minimize(
        objective, 
        params_opt, 
        method='L-BFGS-B', 
        bounds=bounds,
        options={'maxiter': 550, 'ftol': 1e-10, 'gtol': 1e-9}
    )
    
    if result_local.success and result_local.fun < objective(params_opt):
        params_opt = result_local.x
    
    # Scale back to original space
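    # Derivation: the fit uses y_norm = y / y_scale and P_norm = P / P_scale, so
    #   y_scale * A_n / (P / P_scale)^alpha = (A_n * y_scale * P_scale^alpha) / P^alpha,
    # i.e. each coefficient picks up y_scale times its variable's scale raised to the
    # fitted exponent (analogously B with D_scale, C with U_scale).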
    params_scaled = params_opt.copy()
    params_scaled[0] *= y_scale * (P_scale ** params_opt[1])
    params_scaled[2] *= y_scale * (D_scale ** params_opt[3])
    params_scaled[4] *= y_scale * (U_scale ** params_opt[5])
    
    # Scale F: (U_norm/D_norm)^delta = (U/D)^delta * (D_scale/U_scale)^delta
    params_scaled[6] = params_opt[6] * y_scale * ((D_scale / U_scale) ** delta)
    
    return params_scaled
# EVOLVE-BLOCK-END
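
A minimal usage sketch (not part of the submitted run): it fits the law on a small synthetic grid and reports R² via the usual 1 − SS_res/SS_tot formula. The column order [unique_tokens, params, tokens] matches scaling_law_func; the data and loss values below are illustrative only, not from the benchmark.

Python
import numpy as np

# Hypothetical evaluation sketch using the functions defined above.
# Columns: [unique_tokens, params, tokens].
X = np.array([
    [1e9, 1e8, 2e9],
    [1e9, 4e8, 8e9],
    [4e9, 1e9, 1.6e10],
    [8e9, 4e9, 3.2e10],
], dtype=np.float64)
y = np.array([3.6, 3.1, 2.7, 2.4])  # illustrative loss values only

params = fit_scaling_law(X, y)
pred = scaling_law_func(X, params)

# Coefficient of determination on the toy grid.
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2 = 1.0 - ss_res / ss_tot
print(f"R^2 on the toy grid: {r2:.4f}")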
#2 Run 4 R² = 0.916660
#3 Run 1 R² = 0.906971
#4 Run 2 R² = 0.876982
#5 Run 3 R² = 0.852179