
Parallel Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.999954
Mean R²: 0.999945
Min R²: 0.999914
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.999954 (Best)
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Improved program using Variable Projection (VarPro) with Global Grid Search.
The model L = E + A * (N/1e9)^(-alpha) * K^(-beta) is fitted by optimizing the
non-linear exponents (alpha, beta) via a dense grid search followed by L-BFGS-B
refinement; the linear parameters (E, A) are solved for analytically at each step
with a closed-form 2D Non-Negative Least Squares (NNLS) solver.
Uses 4 parameters: [E, A, alpha, beta]
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Computes predicted loss using a power-law scaling model.
    Model: L = E + A * (N/1e9)^(-alpha) * K^(-beta)
    
    Args:
        data_points: (M, 2) array of [num_params, parallel_size] rows
        params: (4,) or (T, 4) array of [E, A, alpha, beta]

    Returns:
        Predicted loss values, shape (M,) for a single parameter set
        or (M, T) for a batch of T parameter sets
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    params = np.asarray(params, dtype=float)

    # Handle parameter batching
    if params.ndim == 1:
        params = params[None, :]  # (1, 4)
    
    # Extract inputs and normalize num_params (billions)
    N_scaled = X[:, 0] / 1.0e9 
    K = X[:, 1]

    # Extract parameters
    E     = params[:, 0]
    A     = params[:, 1]
    alpha = params[:, 2]
    beta  = params[:, 3]

    # Calculate power law terms with broadcasting
    # N_scaled: (M,), alpha: (T,) -> (M, T)
    term_N = N_scaled[:, None] ** (-alpha[None, :])
    term_K = K[:, None] ** (-beta[None, :])
    
    # Combined model: E + A * N^-alpha * K^-beta
    pred = E[None, :] + A[None, :] * term_N * term_K

    # Return appropriate shape
    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law parameters using VarPro with grid search initialization.
    Optimizes (alpha, beta) while solving (E, A) analytically with non-negativity constraints.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y_all = np.asarray(loss_values, dtype=float)
    
    # Handle multi-target fitting
    if y_all.ndim == 1:
        y_all = y_all[:, None]
        
    num_targets = y_all.shape[1]
    optimized_params = []
    
    # Normalize inputs
    N_scaled = X[:, 0] / 1.0e9
    K = X[:, 1]
    
    # Define grid for global search of non-linear parameters
    # alpha (model scaling) usually 0.1-1.5, beta (parallel) usually 0.0-0.5
    grid_alpha = np.linspace(0.01, 1.5, 15)
    grid_beta = np.linspace(0.0, 0.6, 10)
    grid_points = [(a, b) for a in grid_alpha for b in grid_beta]
    
    for i in range(num_targets):
        y = y_all[:, i]
        
        # Inner solver: Given alpha, beta, find optimal E, A >= 0
        def solve_linear_params(alpha, beta):
            # Feature Z = N^-alpha * K^-beta
            Z = (N_scaled ** -alpha) * (K ** -beta)
            
            # We want min || [1 Z] @ [E, A].T - y ||^2 s.t. E,A >= 0
            # Analytical 2D NNLS
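            # Normal equations for the 2x2 linear system in (E, A):
            #   [ n      sum_Z  ] [E]   [ sum_y  ]
            #   [ sum_Z  sum_Z2 ] [A] = [ sum_yZ ]
            # Solved via Cramer's rule; if the interior solution violates
            # E >= 0 or A >= 0, the NNLS optimum lies on a boundary, which
            # the A = 0 and E = 0 cases below cover.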
            sum_Z = np.sum(Z)
            sum_Z2 = np.sum(Z**2)
            sum_y = np.sum(y)
            sum_yZ = np.sum(y * Z)
            n = len(y)
            
            det = n * sum_Z2 - sum_Z**2
            
            best_mse = np.inf
            best_EA = (0.0, 0.0) # (E, A)
            
            # 1. Unconstrained Solution
            if det > 1e-13:
                E_unc = (sum_Z2 * sum_y - sum_Z * sum_yZ) / det
                A_unc = (n * sum_yZ - sum_Z * sum_y) / det
                if E_unc >= 0 and A_unc >= 0:
                    mse = np.mean((E_unc + A_unc * Z - y)**2)
                    return mse, E_unc, A_unc
            
            # 2. Boundary A=0 (Model is constant E)
            E_only = max(0.0, sum_y / n)
            mse_E = np.mean((E_only - y)**2)
            if mse_E < best_mse:
                best_mse = mse_E
                best_EA = (E_only, 0.0)
                
            # 3. Boundary E=0 (Model is A * Z)
            if sum_Z2 > 1e-13:
                A_only = max(0.0, sum_yZ / sum_Z2)
                mse_A = np.mean((A_only * Z - y)**2)
                if mse_A < best_mse:
                    best_mse = mse_A
                    best_EA = (0.0, A_only)
            
            return best_mse, best_EA[0], best_EA[1]

        # Step 1: Global Grid Search
        best_grid_mse = np.inf
        best_grid_params = (0.5, 0.1)
        
        for alpha_try, beta_try in grid_points:
            mse, _, _ = solve_linear_params(alpha_try, beta_try)
            if mse < best_grid_mse:
                best_grid_mse = mse
                best_grid_params = (alpha_try, beta_try)
                
        # Step 2: Local Refinement
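        # VarPro profiled objective: (E, A) are eliminated by the inner NNLS
        # solve, so the outer optimizer searches only the 2-D exponent space.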
        def objective(p):
            mse, _, _ = solve_linear_params(p[0], p[1])
            return mse
        
        # Bounds: alpha in [0, 5], beta in [0, 2]; both kept non-negative.
        bounds = [(0.0, 5.0), (0.0, 2.0)]
        
        try:
            res = minimize(objective, best_grid_params, method='L-BFGS-B', bounds=bounds)
            alpha_opt, beta_opt = res.x
        except Exception:
            # Fall back to the grid-search optimum if refinement fails
            alpha_opt, beta_opt = best_grid_params
            
        # Recover linear parameters
        _, E_opt, A_opt = solve_linear_params(alpha_opt, beta_opt)
        
        optimized_params.append([E_opt, A_opt, alpha_opt, beta_opt])
            
    final_params = np.array(optimized_params)
    
    if num_targets == 1:
        return final_params[0]
    return final_params
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.999954
#3 Run 3 R² = 0.999952
#4 Run 4 R² = 0.999951
#5 Run 5 R² = 0.999914
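Below is a minimal usage sketch, not part of the submitted run: the synthetic model sizes, parallel sizes, ground-truth parameters, and noise level are all illustrative assumptions, and the snippet assumes scaling_law_func and fit_scaling_law from the listing above are in scope.

Python
# Hypothetical smoke test; all data here is synthetic and invented.
import numpy as np

rng = np.random.default_rng(0)
model_sizes = np.array([1e8, 3e8, 1e9, 3e9, 1e10])  # num_params (assumed)
parallel_sizes = np.array([1, 2, 4, 8])             # K values (assumed)
X = np.array([(n, k) for n in model_sizes for k in parallel_sizes])

true_params = np.array([2.0, 1.5, 0.3, 0.1])  # [E, A, alpha, beta], invented
y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=len(X))

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print("fitted [E, A, alpha, beta]:", np.round(fitted, 3))
print("R^2:", 1.0 - ss_res / ss_tot)

Note that computing R² on the same points used for fitting is only a smoke test; the leaderboard's evaluation protocol may differ.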