
Data-Constrained Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.928929
Mean R²: 0.915996
Min R²: 0.893579
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 3 R² = 0.928929
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for data-constrained LLM training
Optimization Strategy: Hybrid Grid Search + NNLS + Robust Least Squares.
Functional Form: L = E + A*N^-alpha + B*D^-beta + C*R^delta
Features:
- Fixed normalization for numerical stability.
- NNLS for optimal linear parameter initialization.
- Robust 'soft_l1' loss for refinement to handle outliers.
"""
import numpy as np
from scipy.optimize import least_squares, nnls
import itertools

def scaling_law_func(data_points, params):
    # data_points: (N, 3) array [unique_tokens, params, tokens]
    # params: 7 parameters [E, A, alpha, B, beta, C, delta]
    
    # Input handling
    X = np.asarray(data_points, dtype=np.float64)
    if X.ndim == 1:
        X = X[None, :]
        
    p = np.asarray(params, dtype=np.float64)
    squeeze_output = False
    if p.ndim == 1:
        p = p[None, :]
        squeeze_output = True
        
    # Fixed scaling constants (approximate geometric means)
    SCALE_N = 1e9
    SCALE_D = 1e11
    
    # Extract features
    N_norm = X[:, 1:2] / SCALE_N
    D_norm = X[:, 2:3] / SCALE_D
    
    # Repetition Ratio R = Tokens / Unique
    # Add epsilon to denominator to avoid division by zero
    R = X[:, 2:3] / (X[:, 0:1] + 1e-9)
    
    # Extract parameters
    # Use abs() to ensure physical constraints (parameters must be non-negative)
    E     = np.abs(p[:, 0:1])
    A     = np.abs(p[:, 1:2])
    alpha = np.abs(p[:, 2:3])
    B     = np.abs(p[:, 3:4])
    beta  = np.abs(p[:, 4:5])
    C     = np.abs(p[:, 5:6])
    delta = np.abs(p[:, 6:7])
    
    # Functional Form: L = E + A*N^-alpha + B*D^-beta + C*R^delta
    term_N = A * (N_norm ** -alpha)
    term_D = B * (D_norm ** -beta)
    term_R = C * (R ** delta)
    
    pred = E + term_N + term_D + term_R
    
    if squeeze_output:
        return pred.flatten()
    return pred

def fit_scaling_law(data_points, loss_values):
    X = np.asarray(data_points, dtype=np.float64)
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    SCALE_N = 1e9
    SCALE_D = 1e11
    
    N_norm = X[:, 1] / SCALE_N
    D_norm = X[:, 2] / SCALE_D
    R = X[:, 2] / (X[:, 0] + 1e-9)
    
    # Grid Search Strategy for initialization
    # Grid ranges based on theoretical expectations (Kaplan, Chinchilla)
    alphas = [0.05, 0.15, 0.33, 0.5, 0.7]
    betas  = [0.05, 0.15, 0.33, 0.5, 0.7]
    deltas = [0.0, 0.5, 1.0, 2.0]
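    # The grid above yields 5 * 5 * 4 = 100 (alpha, beta, delta) combinations; each
    # is turned into one candidate initialization via the NNLS solve below.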
    
    candidates = []
    
    # NNLS target: fit y = E + A*f_N + B*f_D + C*f_R with non-negative coefficients.
    # Assume E >= E_min and solve y - E_min = e_offset + A*f_N + B*f_D + C*f_R,
    # then recover E = e_offset + E_min.
    E_min = 0.5
    y_shifted = np.maximum(y - E_min, 0.0)
    ones = np.ones_like(y)
    
    # Basis powers are recomputed for each grid point inside the loop below;
    # this is cheap at this dataset size (~182 points), so no pre-computation is needed.
    
    for a, b, d in itertools.product(alphas, betas, deltas):
        # Basis functions
        f_N = (N_norm + 1e-12) ** -a
        f_D = (D_norm + 1e-12) ** -b
        f_R = (R + 1e-12) ** d
        
        # Design matrix M: [1, f_N, f_D, f_R]
        M = np.vstack([ones, f_N, f_D, f_R]).T
        
        try:
            # NNLS: min ||Mx - y_shifted||^2 s.t. x >= 0
            coeffs, rnorm = nnls(M, y_shifted)
            
            # Reconstruct parameters: E = coeffs[0] + E_min
            p_init = np.array([
                coeffs[0] + E_min, # E
                coeffs[1],         # A
                a,                 # alpha
                coeffs[2],         # B
                b,                 # beta
                coeffs[3],         # C
                d                  # delta
            ])
            candidates.append((rnorm, p_init))
        except Exception:
            continue
            
    # Select top candidates
    candidates.sort(key=lambda x: x[0])
    # Take top 5 candidates to explore different basins
    top_k = [c[1] for c in candidates[:5]]
    
    if not top_k:
        # Fallback
        top_k = [np.array([1.5, 1.0, 0.3, 1.0, 0.3, 0.0, 0.5])]
        
    # Refinement using Robust Least Squares
    def residuals(p):
        return scaling_law_func(X, p) - y
        
    # Bounds: [E, A, alpha, B, beta, C, delta]
    lower_bounds = [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    upper_bounds = [10.0, np.inf, 3.0, np.inf, 3.0, np.inf, 5.0]
    
    best_cost = float('inf')
    best_params = top_k[0]
    
    for p0 in top_k:
        try:
            res = least_squares(
                residuals,
                p0,
                bounds=(lower_bounds, upper_bounds),
                method='trf',
                loss='soft_l1',  # Robust to outliers
                f_scale=0.1,     # Inlier scale
                max_nfev=500
            )
            
            # Compare candidates on the plain sum of squared residuals (L2 cost).
            # The optimizer minimizes the soft_l1 objective, but selection uses the
            # raw residuals that least_squares reports in res.fun.
            cost = np.sum(res.fun**2)
            
            if cost < best_cost:
                best_cost = cost
                best_params = res.x
        except Exception:
            continue
            
    return best_params
# EVOLVE-BLOCK-END

#2 Run 4 R² = 0.928929
#3 Run 2 R² = 0.927895
#4 Run 5 R² = 0.900646
#5 Run 1 R² = 0.893579
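
For reference, below is a minimal usage sketch (not part of the submitted runs): it builds synthetic [unique_tokens, params, tokens] points, generates losses from hand-picked "true" parameters, fits the law with the functions above, and reports R² in the standard 1 - SS_res/SS_tot form. All numeric values are assumptions chosen for illustration.
Python
import numpy as np

# Assumes scaling_law_func and fit_scaling_law from the run above are in scope.
rng = np.random.default_rng(0)
n = 200
unique_tokens = rng.uniform(1e9, 1e11, size=n)                    # U
model_params  = rng.uniform(1e8, 1e10, size=n)                    # N
tokens        = unique_tokens * rng.uniform(1.0, 16.0, size=n)    # D, up to 16x repetition
X = np.column_stack([unique_tokens, model_params, tokens])

# Hand-picked "ground truth" [E, A, alpha, B, beta, C, delta] plus small noise.
true_params = np.array([1.7, 0.8, 0.35, 0.6, 0.30, 0.05, 0.8])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.02, size=n)

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print("Fitted params:", np.round(fitted, 3))
print("R^2 =", 1.0 - ss_res / ss_tot)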