← Back to Leaderboard

U-shaped Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.931286
Mean R²: 0.931125
Min R²: 0.930479
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.931286 (Best)
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Implements a smooth transition (sigmoid-weighted) between two linear regimes in log-flops space.
This can model monotonic, U-shaped, and inverted U-shaped scaling laws (double descent).
Uses 6 parameters: slope1, bias1, slope2, bias2, transition_point, transition_sharpness.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-blended two-line scaling law.

    The prediction smoothly interpolates between two linear regimes in
    log-flops space, which can express monotonic, U-shaped, and inverted
    U-shaped curves.

    Parameters
    ----------
    data_points : array-like, shape (N,) or (N, F)
        Inputs; only column 0 (log-flops) is used.
    params : array-like, shape (6,) or (T, 6)
        Parameter rows [w1, b1, w2, b2, m, s]: slopes/biases of the two
        lines, transition midpoint, and transition sharpness.

    Returns
    -------
    np.ndarray
        Predictions of shape (N,) for a single parameter row, else (N, T).
    """
    features = np.atleast_2d(np.asarray(data_points))
    log_flops = features[:, 0:1]  # keep 2-D: (N, 1) for broadcasting

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]
    theta = theta.T  # (P, T) so (N, 1) op (1, T) broadcasts to (N, T)

    # Unpack the six parameter rows, each kept as shape (1, T).
    w1, b1, w2, b2, m, s = (theta[i:i + 1, :] for i in range(6))

    # Logistic weight of the second regime; clip the argument so exp()
    # never overflows.
    arg = np.clip(s * (log_flops - m), -50, 50)
    weight = 1.0 / (1.0 + np.exp(-arg))

    # The two linear regimes and their sigmoid-weighted blend.
    left_line = w1 * log_flops + b1
    right_line = w2 * log_flops + b2
    blended = (1.0 - weight) * left_line + weight * right_line

    if blended.shape[1] == 1:
        return blended[:, 0]
    return blended


def fit_scaling_law(data_points, loss_values):
    """Fit the 6-parameter sigmoid-blended two-line scaling law per target.

    For each target column, runs a multi-start L-BFGS-B optimization: the
    transition midpoint is initialized at several percentiles of the input,
    and the two linear regimes are initialized by least-squares fits on the
    data left/right of that split.

    Parameters
    ----------
    data_points : array-like, shape (N,) or (N, F)
        Inputs; column 0 (log-flops) drives the fit.
    loss_values : array-like, shape (N,) or (N, T)
        Observed losses, one column per target.

    Returns
    -------
    np.ndarray
        Best parameters [w1, b1, w2, b2, m, s]: shape (6,) for a single
        target, (T, 6) otherwise.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    N, T = y_2d.shape

    def _safe_linear_fit(xs, ys, fallback_bias):
        # Degree-1 least squares; fall back to a flat line at the mean when
        # the subset is too small or the fit is numerically degenerate.
        if xs.size < 2:
            return 0.0, fallback_bias
        try:
            w, b = np.polyfit(xs, ys, 1)
            return w, b
        except (np.linalg.LinAlgError, ValueError, TypeError):
            return 0.0, fallback_bias

    # Candidate transition midpoints: split the data at several percentiles
    # to locate the "bend" between the two regimes.
    m_candidates = np.percentile(x_flat, [20, 40, 60, 80])

    # Bounds (invariant across targets): m stays inside the observed x-range;
    # s stays positive and bounded so the transition remains well-conditioned.
    bounds = [
        (None, None), (None, None),        # w1, b1
        (None, None), (None, None),        # w2, b2
        (np.min(x_flat), np.max(x_flat)),  # m
        (0.1, 100.0),                      # s
    ]
    mid_x = np.mean(x_flat)

    params_opt_list = []
    for t in range(T):
        yt = y_2d[:, t]
        mean_y = np.mean(yt)

        # Fallback: constant prediction at the mean (both lines flat).
        best_p = np.array([0.0, mean_y, 0.0, mean_y, mid_x, 1.0])
        best_loss = np.mean((yt - mean_y) ** 2)

        def objective(p, yt=yt):
            # Mean squared error of the scaling-law prediction on target t.
            pred = scaling_law_func(X, p)
            return np.mean((pred - yt) ** 2)

        for m_init in m_candidates:
            # Initialize the two regimes from fits left/right of the split.
            mask_left = x_flat < m_init
            w1_init, b1_init = _safe_linear_fit(
                x_flat[mask_left], yt[mask_left], mean_y)
            w2_init, b2_init = _safe_linear_fit(
                x_flat[~mask_left], yt[~mask_left], mean_y)

            # s = 5.0: start with a moderately sharp transition.
            p0 = np.array([w1_init, b1_init, w2_init, b2_init, m_init, 5.0])

            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds)
            except Exception:
                # Best-effort multi-start: a failed restart is simply skipped.
                continue
            if res.fun < best_loss:
                best_loss = res.fun
                best_p = res.x

        params_opt_list.append(best_p)

    params_opt = np.array(params_opt_list)
    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.931286
#3 Run 3 R² = 0.931286
#4 Run 4 R² = 0.931286
#5 Run 5 R² = 0.930479