
U-shaped Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.929632
Mean R²: 0.927804
Min R²: 0.925939
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 2 R² = 0.929632
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM easy question performance with U-shaped pattern
Optimized 6-parameter model capturing double descent behavior with improved fitting
Refined hyperparameters and optimization strategy for robustness
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    U-shaped scaling law with 6 parameters capturing double descent:
    params[0]: a (amplitude of Gaussian dip)
    params[1]: b (curvature/width of dip)
    params[2]: c (location of minimum)
    params[3]: d (linear recovery slope)
    params[4]: e (baseline floor)
    params[5]: offset (vertical shift)
    
    Model: pred = a * exp(-b * (x - c)^2) + d * (x - c) + e + offset
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0]
    params = np.asarray(params).flatten()
    
    # Extract 6 parameters
    a, b, c, d, e, offset = params[:6]
    
    # Ensure b is positive (controls width)
    b = np.abs(b) + 1e-6
    
    # U-shaped function: Gaussian dip + linear recovery + baseline
    dx = x - c
    gaussian_term = a * np.exp(-b * dx**2)
    linear_term = d * dx
    
    pred = gaussian_term + linear_term + e + offset
    
    return pred
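
# A minimal sanity-check sketch (hypothetical helper, not called anywhere in
# the fit): with a negative amplitude a, the curve should dip near c and
# recover roughly linearly away from it, so the grid minimum lands close to c.
def _demo_u_shape():
    xs = np.linspace(0.0, 10.0, 101)
    demo_params = [-0.5, 1.0, 5.0, 0.05, 1.0, 0.0]  # hypothetical values
    preds = scaling_law_func(xs[:, None], demo_params)
    # The minimum sits slightly left of c = 5 because the recovery slope d > 0
    assert abs(xs[int(np.argmin(preds))] - 5.0) < 0.5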


def fit_scaling_law(data_points, loss_values):
    """
    Optimize U-shaped scaling law using adaptive strategy:
    1. Normalize data for numerical stability
    2. Smart initialization from data statistics
    3. Global optimization (differential_evolution)
    4. Local refinement (L-BFGS-B)
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).flatten()
    x = X[:, 0]
    
    # Data statistics for smart initialization
    x_min, x_max = np.min(x), np.max(x)
    x_mean = np.mean(x)
    x_range = x_max - x_min
    
    y_min, y_max = np.min(y), np.max(y)
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_range = y_max - y_min
    
    # Identify potential minimum location (where y is lowest)
    min_idx = np.argmin(y)
    x_at_min = x[min_idx]
    y_at_min = y[min_idx]
    
    def objective(params):
        """MSE loss with regularization"""
        pred = scaling_law_func(X, params)
        mse = np.mean((pred - y)**2)
        # Light L2 regularization discourages extreme parameter values and,
        # via the offset term, breaks the e/offset degeneracy
        reg = 0.0001 * (params[0]**2 + params[1]**2 + params[5]**2)
        return mse + reg
    
    # Parameter bounds: [a, b, c, d, e, offset]
    bounds = [
        (-2.0 * y_range, 2.0 * y_range),  # a: amplitude (can be negative for dip)
        (0.01, 20.0),                      # b: curvature (must be positive, controls width)
        (x_min - 0.5*x_range, x_max + 0.5*x_range),  # c: minimum location
        (-3.0, 3.0),                       # d: linear slope for recovery
        (y_min - y_std, y_max + y_std),    # e: baseline floor
        (y_min - 2*y_std, y_max + 2*y_std) # offset: vertical shift
    ]
    
    # Smart initialization based on data
    init_params = np.array([
        -0.3 * y_range,         # a: slight negative dip
        2.0,                    # b: moderate curvature
        x_at_min,               # c: put minimum where y is lowest
        0.05,                   # d: slight recovery slope
        y_mean,                 # e: baseline at mean
        y_at_min - y_mean       # offset: shift towards minimum
    ])

    # Fall back to a conservative guess if the smart one is degenerate
    init_loss = objective(init_params)
    if np.isnan(init_loss) or np.isinf(init_loss):
        init_params = np.array([-0.5 * y_std, 1.0, x_mean, 0.1, y_mean, 0.0])

    # Clip the guess into the search box so it is a valid starting point
    bounds_arr = np.array(bounds)
    init_params = np.clip(init_params, bounds_arr[:, 0], bounds_arr[:, 1])
    
    # Global optimization with differential evolution, seeded with the smart
    # initial guess (x0, available in SciPy >= 1.7, replaces the best member
    # of the initial population)
    result_de = differential_evolution(
        objective,
        bounds,
        x0=init_params,
        seed=42,
        maxiter=500,
        popsize=20,
        atol=1e-8,
        tol=1e-8,
        workers=1,
        updating='deferred',
        init='latinhypercube',
        mutation=(0.5, 1.5),
        recombination=0.7,
        polish=False
    )
    
    params_global = result_de.x
    
    # Local refinement with L-BFGS-B for high precision
    result_local = minimize(
        objective,
        params_global,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 500,
            'ftol': 1e-10,
            'gtol': 1e-8,
            'maxcor': 20
        }
    )
    
    # Accept the local refinement whenever it actually improves the objective
    # (L-BFGS-B respects the bounds even if its success flag is not set)
    if np.isfinite(result_local.fun) and result_local.fun < result_de.fun:
        params_opt = result_local.x
    else:
        params_opt = params_global
    
    # Ensure positivity of b parameter
    params_opt[1] = np.abs(params_opt[1]) + 1e-6
    
    return params_opt
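
# A minimal end-to-end sketch (hypothetical helper, not called anywhere):
# generate noisy observations from a known U-shaped curve, refit them, and
# check that the fitted minimum location c lands near the true value.
def _demo_fit_recovers_minimum():
    rng = np.random.default_rng(0)
    xs = np.linspace(0.0, 10.0, 60)
    true_params = [-0.8, 0.5, 4.0, 0.06, 1.2, 0.0]  # hypothetical ground truth
    ys = scaling_law_func(xs[:, None], true_params)
    ys = ys + rng.normal(0.0, 0.02, size=ys.shape)
    fitted = fit_scaling_law(xs[:, None], ys)
    assert abs(fitted[2] - 4.0) < 1.0  # c recovered to within the dip width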

# EVOLVE-BLOCK-END
#2 Run 3 R² = 0.929434
#3 Run 1 R² = 0.928072
#4 Run 4 R² = 0.925944
#5 Run 5 R² = 0.925939
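
The R² values above are presumably the standard coefficient of determination between the model's predictions and held-out loss values; a minimal sketch of that metric, assuming a fitted parameter vector from the code above:

Python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot

# e.g. r_squared(y_test, scaling_law_func(X_test, fitted_params))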