← Back to Leaderboard

SFT Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.999266
Mean R²: 0.998625
Min R²: 0.998460
Runs: 5

All Runs (sorted by R²)

Best Run 3 R² = 0.999266
Python
# EVOLVE-BLOCK-START
"""
Improved scaling law discovery using a 4-parameter Sigmoidal (Hill) function:
L(D) = E + A / (1 + (D/B)^alpha)
Implemented as: L(D) = E + A * sigmoid(-alpha * (ln D - ln B)) for numerical stability.
Optimization Strategy:
1. Variable Projection with Grid Search:
   - Scan (log(B), alpha) space.
   - For each pair, solve for (E, A) using Non-Negative Least Squares (NNLS).
   - This effectively finds the global basin of attraction.
2. Non-Linear Least Squares Refinement:
   - Use the best grid candidates to initialize a Trust Region Reflective fit.
   - Optimize all 4 parameters jointly with bounds to ensure physical validity.
   - Bounded A and E to keep values within realistic ranges (preventing unstable large-parameter solutions).
"""
import numpy as np
from scipy.optimize import nnls, curve_fit
from scipy.special import expit

def scaling_law_func(data_points, params):
    # data_points: (N, 1) array of data sizes
    # params: Array of shape (P,) or (T, P) where P=4 [E, A, B, alpha]
    # Returns: Predicted loss values
    
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Unpack parameters
    E     = params[:, 0]
    A     = params[:, 1]
    B     = params[:, 2]
    alpha = params[:, 3]
    
    # Model: E + A * sigmoid( -alpha * (ln(x) - ln(B)) )
    # This is equivalent to E + A / (1 + (x/B)^alpha)
    
    # Broadcasting: x is (N,), params are (T,)
    # Result should be (N, T)
    
    # Avoid log(0)
    x_safe = np.maximum(x, 1e-10)
    log_x = np.log(x_safe)[:, None]  # (N, 1)
    
    # B must be positive
    B_safe = np.maximum(B, 1e-20)
    log_B = np.log(B_safe)[None, :]  # (1, T)
    
    alf   = alpha[None, :]           # (1, T)
    
    # Argument for sigmoid: -alpha * (ln x - ln B)
    arg = -alf * (log_x - log_B)
    
    # sigmoid(z) = 1 / (1 + exp(-z))
    w = expit(arg)
    
    pred = E[None, :] + A[None, :] * w
    
    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    # data_points: (N, 1)
    # loss_values: (N,) or (N, T)
    
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    y_in = np.asarray(loss_values)
    
    if y_in.ndim == 1:
        y_2d = y_in[:, None]
    else:
        y_2d = y_in
        
    num_targets = y_2d.shape[1]
    results = np.zeros((num_targets, 4))
    
    # Normalize inputs for numerical stability
    x_max = np.max(x) if x.size > 0 else 1.0
    x_norm = x / x_max
    
    # Precompute log x for optimization
    log_x_norm = np.log(np.maximum(x_norm, 1e-10))
    ones_vec = np.ones_like(x_norm)
    
    # Define model for curve_fit (working with normalized x and y)
    # Params: E_n, A_n, log_B_n, alpha
    def model_opt(x_n, e_n, a_n, log_b_n, alf):
        # x_n is passed but we use precomputed log_x_norm if possible
        # but curve_fit passes x_n. We recompute log to be safe/compatible.
        lx = np.log(np.maximum(x_n, 1e-10))
        arg = -alf * (lx - log_b_n)
        return e_n + a_n * expit(arg)

    for i in range(num_targets):
        y = y_2d[:, i]
        y_max = np.max(y) if np.max(y) > 0 else 1.0
        y_norm = y / y_max
        
        # 1. Grid Search with Variable Projection
        # We search over (log_B_norm, alpha)
        # B can range from very small (pure power law) to > 1 (saturation)
        # log(B_norm) grid:
        
        # Grid density:
        # log_B: -10 to 2 (covers orders of magnitude below and above data range)
        lb_grid = np.linspace(-10, 2.0, 13)
        # alpha: 0.1 to 4.0
        a_grid = np.array([0.1, 0.3, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0])
        
        candidates = []
        
        for lb in lb_grid:
            for alf in a_grid:
                # Construct basis w
                arg = -alf * (log_x_norm - lb)
                w = expit(arg)
                
                # Solve NNLS: [1, w] @ [E, A] ~ y
                A_mat = np.vstack([ones_vec, w]).T
                coeffs, rnorm = nnls(A_mat, y_norm)
                
                mse = (rnorm**2) / len(y_norm)
                
                # coeffs are [E_n, A_n]
                candidates.append((mse, [coeffs[0], coeffs[1], lb, alf]))
                
        # Sort and pick best candidates
        candidates.sort(key=lambda c: c[0])
        top_candidates = [c[1] for c in candidates[:3]]
        
        # 2. Refine with curve_fit
        # We optimize [E_n, A_n, log_B_n, alpha]
        best_popt = top_candidates[0]
        best_final_mse = float('inf')
        
        # Bounds:
        # E_n >= 0
        # A_n >= 0. But also A_n shouldn't be arbitrarily large (e.g. < 20) to avoid instability
        # log_B_n: [-20, 10]
        # alpha: [0, 10]
        bounds = ([0.0, 0.0, -20.0, 0.0], [np.inf, 20.0, 10.0, 10.0])
        
        for p0 in top_candidates:
            try:
                # p0 is [E, A, lb, alf]
                popt, _ = curve_fit(model_opt, x_norm, y_norm, p0=p0, 
                                    bounds=bounds, method='trf', 
                                    maxfev=1000, ftol=1e-6)
                
                pred = model_opt(x_norm, *popt)
                mse = np.mean((pred - y_norm)**2)
                
                if mse < best_final_mse:
                    best_final_mse = mse
                    best_popt = popt
            except:
                continue
        
        # 3. De-normalize
        E_n, A_n, log_B_n, alf = best_popt
        
        E_real = E_n * y_max
        A_real = A_n * y_max
        # B_real = x_max * exp(log_B_n)
        B_real = x_max * np.exp(log_B_n)
        
        results[i] = [E_real, A_real, B_real, alf]
    
    if y_in.ndim == 1:
        return results[0]
    return results
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.998467
#3 Run 4 R² = 0.998467
#4 Run 5 R² = 0.998464
#5 Run 1 R² = 0.998460