U-shaped Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.931613
Mean R²: 0.838363
Min R²: 0.785188
Runs: 5
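
For reference, the R² values above are presumably the standard coefficient of determination between the fitted law's predictions and the measured losses; a minimal sketch of that computation (the function name is illustrative, not from the benchmark harness):

Python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: R^2 = 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot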

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.931613
Python
# EVOLVE-BLOCK-START
"""
Simplified U-shaped scaling law for the double-descent pattern.
Uses a shifted quadratic with exponential modulation, chosen for
numerical stability and fitting quality.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    U-shaped form: y = a*(x-c)^2 + b*(x-c) + d*exp(-e*|x-c|) + f
    
    Parameters (6):
    - a: quadratic strength (U-shape curvature)
    - b: linear term (asymmetry)
    - c: horizontal shift (minimum location)
    - d: exponential amplitude (initial descent)
    - e: exponential decay rate
    - f: vertical offset (baseline)
    """
    X = np.atleast_2d(np.asarray(data_points))
    log_flops = X[:, 0]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Pad short parameter vectors with zeros so the 6-way unpack below is safe
    if params.shape[1] < 6:
        params = np.pad(params, ((0, 0), (0, 6 - params.shape[1])), constant_values=0)
    
    # Only the first parameter row is used; batched parameter sets are ignored
    a, b, c, d, e, f = params[0, :6]
    
    # Shifted coordinate for centering
    x_shift = log_flops - c
    
    # Quadratic base for U-shape
    quadratic = a * x_shift**2 + b * x_shift
    
    # Exponential modulation; clip the exponent to keep np.exp numerically safe
    exp_arg = np.clip(-np.abs(e) * np.abs(x_shift), -50, 50)
    exponential = d * np.exp(exp_arg)
    
    return quadratic + exponential + f


def fit_scaling_law(data_points, loss_values):
    """
    Fit using intelligent multi-start local optimization with adaptive fallback
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    log_flops = X[:, 0]
    
    # Data statistics
    y_mean = np.mean(y)
    y_std = np.std(y)
    y_min = np.min(y)
    y_max = np.max(y)
    y_range = y_max - y_min
    
    x_min = np.min(log_flops)
    x_max = np.max(log_flops)
    x_range = x_max - x_min
    x_mean = np.mean(log_flops)
    
    # Find empirical minimum for smart initialization
    min_idx = np.argmin(y)
    x_at_min = log_flops[min_idx]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            mse = np.mean((pred - y)**2)
            # Minimal regularization for numerical stability
            reg = 1e-8 * np.sum(params**2)
            return mse + reg
        except Exception:
            # Penalize parameter vectors that trigger numerical errors
            return 1e10
    
    # Parameter bounds [a, b, c, d, e, f]
    bounds = [
        (0, 4*y_range),                              # a: positive for U-shape
        (-3*y_range, 3*y_range),                     # b: linear asymmetry
        (x_min - 0.6, x_max + 0.6),                  # c: shift parameter
        (-4*y_range, y_range),                       # d: exponential amplitude
        (0.1, 10.0),                                  # e: decay rate
        (y_min - 1.5*y_std, y_max + 1.5*y_std)       # f: baseline offset
    ]
    
    # Smart initialization strategies based on data
    init_attempts = [
        # Strategy 1: Conservative centered at empirical min
        [y_range*0.35, 0, x_at_min, -y_std*0.8, 1.0, y_mean],
        
        # Strategy 2: Stronger U-shape with moderate exponential
        [y_range*0.6, -y_std*0.3, x_at_min, -1.5*y_std, 1.3, y_mean],
        
        # Strategy 3: Gentle U with strong initial descent
        [y_range*0.25, y_std*0.2, x_at_min, -2*y_std, 0.9, y_mean],
        
        # Strategy 4: Early minimum bias
        [y_range*0.4, -y_std*0.4, x_min + 0.35*x_range, -y_std*1.2, 1.1, y_mean],
        
        # Strategy 5: Late minimum bias
        [y_range*0.4, y_std*0.3, x_max - 0.35*x_range, -y_std*1.2, 1.1, y_mean],
        
        # Strategy 6: Sharp curvature
        [y_range*0.8, 0, x_at_min, -y_std*0.6, 1.8, y_mean],
        
        # Strategy 7: Centered on data mean
        [y_range*0.45, -y_std*0.15, x_mean, -y_std, 1.15, y_mean],
    ]
    
    best_result = None
    best_loss = float('inf')
    
    # Multi-start local optimization
    for init in init_attempts:
        try:
            res = minimize(
                objective, 
                init, 
                method='L-BFGS-B', 
                bounds=bounds,
                options={'maxiter': 1000, 'ftol': 1e-10}
            )
            if res.fun < best_loss:
                best_loss = res.fun
                best_result = res
        except Exception:
            continue
    
    # Global search fallback when local optimization is insufficient (MSE above 0.25)
    if best_result is None or best_loss > 0.25:
        try:
            res_de = differential_evolution(
                objective, 
                bounds, 
                maxiter=180, 
                popsize=15, 
                seed=42, 
                atol=1e-9, 
                tol=1e-9, 
                polish=True, 
                workers=1
            )
            if res_de.fun < best_loss:
                best_result = res_de
        except Exception:
            pass
    
    # Return best result or robust fallback
    if best_result is not None and hasattr(best_result, 'x'):
        return best_result.x
    
    # Robust fallback based on data
    return np.array([y_range*0.35, 0, x_at_min, -y_std*0.8, 1.0, y_mean])
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.888006
#3 Run 3 R² = 0.793763
#4 Run 4 R² = 0.793244
#5 Run 5 R² = 0.785188
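
For context, a minimal sketch of how the best run's code might be exercised end to end, assuming scaling_law_func and fit_scaling_law from the listing above are in scope; the synthetic double-descent curve and all names below are illustrative assumptions, not part of the benchmark harness:

Python
import numpy as np

# Synthetic double-descent-like data: a quadratic bowl plus an initial dip,
# matching the model family y = a*(x-c)^2 + b*(x-c) + d*exp(-e*|x-c|) + f
rng = np.random.default_rng(0)
log_flops = np.linspace(8.0, 14.0, 60)
true_y = 0.4 * (log_flops - 11.0) ** 2 - 1.2 * np.exp(-np.abs(log_flops - 11.0)) + 3.0
y = true_y + rng.normal(scale=0.05, size=log_flops.shape)

X = log_flops[:, None]              # fit_scaling_law expects 2-D data_points
params = fit_scaling_law(X, y)      # returns [a, b, c, d, e, f]
pred = scaling_law_func(X, params)

ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print("R^2 =", 1.0 - ss_res / ss_tot)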