SFT Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.999306
Mean R²: 0.998120
Min R²: 0.995633
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.999306
Python
# EVOLVE-BLOCK-START
"""
Enhanced 4-parameter scaling law with adaptive transition behavior
Form: L(N) = L_inf + A / (N^alpha + c)
This smooth transition form avoids logarithmic singularities while capturing
both rapid initial decay and gradual asymptotic convergence
"""
import numpy as np
from scipy.optimize import differential_evolution, minimize, dual_annealing

def scaling_law_func(data_points, params):
    """
    4-parameter scaling law: L(N) = L_inf + A / (N^alpha + c)
    params = [L_inf, A, alpha, c]
    The offset c sets the transition scale: for N^alpha << c the curve is
    roughly flat near L_inf + A / c; for N^alpha >> c it decays as a power law
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    N = X[:, 0]
    
    params = np.asarray(params, dtype=np.float64)
    if params.ndim == 1:
        params = params[None, :]
    
    L_inf = params[:, 0]
    A = params[:, 1]
    alpha = params[:, 2]
    c = params[:, 3]
    
    # Clamp N at 1.0 so small or non-positive inputs cannot destabilize the power term
    N_safe = np.maximum(N, 1.0)
    
    # Smooth power-law form with offset for adaptive transition
    denominator = np.maximum(N_safe[:, None] ** alpha[None, :] + np.abs(c[None, :]), 1e-10)
    
    pred = L_inf[None, :] + A[None, :] / denominator
    
    return pred[:, 0] if pred.shape[1] == 1 else pred
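
# Shape contract of scaling_law_func (a note on the broadcasting above):
# a single length-4 parameter vector yields a 1-D array of predictions,
# one per data point; a (k, 4) batch of parameter vectors yields an
# (n_points, k) matrix with one column per parameter vector.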


def fit_scaling_law(data_points, loss_values):
    """
    Advanced fitting using dual annealing for better global search
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    N = X[:, 0]
    y2d = y[:, None] if y.ndim == 1 else y
    
    # Data statistics for adaptive bounds
    y_min, y_max = np.min(y2d), np.max(y2d)
    y_range = y_max - y_min
    N_min, N_max = np.min(N), np.max(N)
    
    # Compute initial estimates for better convergence
    # Estimate L_inf from minimum loss with buffer
    L_inf_est = y_min - 0.1
    
    # Estimate A and alpha from log-log fit of initial decay
    if len(N) > 2:
        idx_low = N < np.percentile(N, 50)
        if np.sum(idx_low) > 2:
            N_low = N[idx_low]
            y_low = y2d[idx_low, 0] - L_inf_est
            y_low = np.maximum(y_low, 1e-6)
            
            log_N = np.log(N_low)
            log_y = np.log(y_low)
            
            # For N^alpha >> c the model reduces to L_inf + A * N^-alpha, so
            # log(y - L_inf) ~ log(A) - alpha*log(N); fit a line in log space
            coeffs = np.polyfit(log_N, log_y, 1)
            alpha_est = -coeffs[0]
            A_est = np.exp(coeffs[1])
        else:
            alpha_est = 0.5
            A_est = y_range * (N_max ** 0.5)
    else:
        alpha_est = 0.5
        A_est = y_range * (N_max ** 0.5)
    
    # Informed parameter bounds: [L_inf, A, alpha, c]
    bounds = [
        (y_min - 0.5, y_min + 0.8),              # L_inf: near minimum with wider margin
        (1e-3, y_range * N_max**1.5),            # A: power coefficient
        (0.1, 1.5),                               # alpha: wider range for flexibility
        (1.0, N_max * 0.15)                       # c: transition offset
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            pred = pred[:, None] if pred.ndim == 1 else pred
            
            # Mean squared error
            mse = np.mean((pred - y2d) ** 2)
            
            # Adaptive regularization based on parameter magnitudes; weights
            # (~1e-9) are far below typical MSE, so they only break ties
            reg_A = 1e-9 * (params[1] / (y_range * N_max**0.5))**2
            reg_c = 1e-9 * (params[3] / N_max)**2
            
            # Gently pull alpha toward 0.5, the same prior used for the
            # initial estimate above
            alpha_reg = 1e-8 * (params[2] - 0.5)**2
            
            # Penalize L_inf far from minimum
            L_inf_reg = 1e-7 * (params[0] - y_min)**2
            
            return mse + reg_A + reg_c + alpha_reg + L_inf_reg
        except Exception:
            return 1e10
    
    # Try dual annealing first (better global search than DE in some cases)
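    # initial_temp=5230.0, visit=2.62 and accept=-5.0 below are scipy's
    # documented defaults for dual_annealing, written out explicitly;
    # no_local_search=True skips the built-in local polish, deferring it
    # to the dedicated refinement stages further down.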
    try:
        result_da = dual_annealing(
            objective,
            bounds,
            seed=42,
            maxiter=400,
            initial_temp=5230.0,
            visit=2.62,
            accept=-5.0,
            no_local_search=True
        )
        params_opt = result_da.x
        best_obj = result_da.fun
    except Exception:
        # Fall back to the heuristic initial estimates computed above
        params_opt = np.array([L_inf_est, A_est, alpha_est, N_max * 0.05])
        best_obj = objective(params_opt)
    
    # Differential evolution as a second global search; keep the better result
    try:
        result_de = differential_evolution(
            objective, 
            bounds, 
            seed=43,
            maxiter=500,
            popsize=20,
            atol=1e-11,
            tol=1e-11,
            strategy='best1bin',
            mutation=(0.5, 1.5),
            recombination=0.8,
            polish=False,
            workers=1
        )
        if result_de.fun < best_obj:
            params_opt = result_de.x
            best_obj = result_de.fun
    except Exception:
        pass
    
    # Multi-stage local refinement with multiple methods
    for method in ['L-BFGS-B', 'Powell', 'TNC']:
        try:
            result_local = minimize(
                objective,
                params_opt,
                method=method,
                bounds=bounds if method in ['L-BFGS-B', 'TNC'] else None,
                options={'ftol': 1e-12, 'maxiter': 800}
            )
            if result_local.success and result_local.fun < best_obj:
                params_opt = result_local.x
                best_obj = result_local.fun
        except Exception:
            pass
    
    # Final high-precision polish with Nelder-Mead (unbounded, tight tolerances)
    try:
        result_nm = minimize(
            objective,
            params_opt,
            method='Nelder-Mead',
            options={'xatol': 1e-11, 'fatol': 1e-12, 'maxiter': 500}
        )
        if result_nm.success and result_nm.fun < best_obj:
            params_opt = result_nm.x
    except Exception:
        pass
    
    return params_opt
# EVOLVE-BLOCK-END
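
Below is a minimal usage sketch, not part of the submitted run: it fits the model above to synthetic data and reports R². It assumes scaling_law_func and fit_scaling_law are in scope (e.g., appended to the same file); the true parameters and noise level are illustrative.

Python
# Hypothetical smoke test for the scaling-law fit above (synthetic data only).
import numpy as np

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    N = np.logspace(2, 6, 25)                     # dataset sizes, 1e2 .. 1e6
    true_params = [1.8, 40.0, 0.6, 50.0]          # illustrative [L_inf, A, alpha, c]

    # scaling_law_func expects a (n_points, n_features) array; N goes in column 0
    y = scaling_law_func(N[:, None], np.asarray(true_params))
    y = y + rng.normal(0.0, 0.005, size=y.shape)  # small observation noise

    params = fit_scaling_law(N[:, None], y)
    pred = scaling_law_func(N[:, None], params)

    # Coefficient of determination, the metric reported on the leaderboard
    ss_res = np.sum((y - pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    print("fitted params:", np.round(params, 4))
    print("R^2 =", 1.0 - ss_res / ss_tot)
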
#2 Run 3 R² = 0.999206
#3 Run 4 R² = 0.998917
#4 Run 2 R² = 0.997537
#5 Run 5 R² = 0.995633