MoE Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.963224
Mean R²: 0.958691
Min R²: 0.954018
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.963224
Python
# EVOLVE-BLOCK-START
"""
Enhanced MoE scaling law with multiplicative expert-parameter coupling
Key innovations:
1. Quadratic log expert modulation in the denominator (captures non-linear saturation in E)
2. Smooth tanh-based expert effect with optimized constraints
3. Refined parameter bounds for faster convergence
4. Enhanced multi-start optimization strategy
"""
import numpy as np
from scipy.optimize import differential_evolution, minimize

def scaling_law_func(data_points, params):
    """
    Scaling law: L = a * N^(-b) / (1 + c * log(E)^2) + d * tanh(e * E) + f
    
    Where:
    - N: dense_parameter_count
    - E: num_experts
    - a, b: base parameter scaling (Chinchilla-inspired)
    - c: quadratic log expert interaction (non-linear saturation)
    - d, e: hyperbolic expert effect (smooth bounded saturation)
    - f: irreducible loss floor
    
    6 parameters total
    
    Key insight: the log(E)^2 term in the denominator models non-linear
    expert-parameter trade-offs, while tanh provides smooth, bounded saturation
    for expert routing effects
    """
    X = np.atleast_2d(np.asarray(data_points))
    num_experts = X[:, 0]
    dense_params = X[:, 1]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Extract and constrain parameters for numerical stability
    a = np.abs(params[:, 0]) + 1e-10      # Scale for parameter term (positive)
    b = np.clip(params[:, 1], 0.01, 1.2)  # Parameter exponent (bounded)
    c = np.clip(params[:, 2], -0.5, 0.5)  # Quadratic log coefficient (bounded)
    d = np.abs(params[:, 3])              # Hyperbolic scale (non-negative)
    e = np.clip(params[:, 4], 0.001, 0.2) # Hyperbolic rate (small positive)
    f = params[:, 5]                       # Loss floor (unconstrained)
    
    # Safe values for numerical operations
    E_safe = np.maximum(num_experts, 1.0)
    N_safe = np.maximum(dense_params, 1e6)
    
    # Term 1: Parameter scaling with quadratic log expert modulation
    # Division by (1 + c*log(E)^2) captures non-linear expert effects
    log_E_sq = np.log(E_safe[None, :]) ** 2
    denominator = 1.0 + c[:, None] * log_E_sq
    denominator = np.maximum(denominator, 0.1)  # Prevent division issues
    term1 = a[:, None] * np.power(N_safe[None, :], -b[:, None]) / denominator
    
    # Term 2: Smooth hyperbolic expert effect (bounded saturation)
    # tanh provides smooth, bounded behavior for routing overhead
    term2 = d[:, None] * np.tanh(e[:, None] * E_safe[None, :])
    
    # Combined prediction
    pred = term1 + term2 + f[:, None]
    
    return pred[0, :] if pred.shape[0] == 1 else pred.T


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced three-stage optimization with adaptive strategy:
    1. Global search with differential evolution
    2. Multi-start local refinement with diversity
    3. Final polishing with tight tolerances
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    
    if y.ndim == 1:
        y = y[:, None]
    
    # Data range informs the adaptive bound on the loss floor
    y_min = float(np.min(y))
    
    # Adaptive parameter bounds
    # [a, b, c, d, e, f]
    bounds = [
        (0.1, 300.0),               # a: wide range for parameter scale
        (0.01, 1.0),                # b: standard power law range
        (-0.4, 0.4),                # c: bounded quadratic log coefficient
        (0.0, 20.0),                # d: hyperbolic scale (non-negative)
        (0.001, 0.15),              # e: small hyperbolic rate for smooth saturation
        (y_min - 0.8, y_min + 1.2)  # f: loss floor with adaptive range
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y.ravel()
            mse = np.mean(residuals ** 2)
            
            # Minimal regularization for stability
            reg = 1e-8 * np.sum(params ** 2)
            
            return mse + reg
        except Exception:
            return 1e10
    
    # Stage 1: Global search with enhanced differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        maxiter=550,
        popsize=26,
        seed=42,
        atol=1e-8,
        tol=1e-8,
        strategy='best1bin',
        mutation=(0.6, 1.8),
        recombination=0.75,
        workers=1,
        polish=False
    )
    
    # Stage 2: Multi-start local refinement with strategic diversity
    best_result = result_de
    best_loss = result_de.fun
    
    for seed_offset in [0, 750, 1500, 2500]:
        # Perturb initial guess for exploration
        np.random.seed(42 + seed_offset)
        x0 = result_de.x + np.random.randn(6) * 0.06 * np.abs(result_de.x)
        x0 = np.clip(x0, [b[0] for b in bounds], [b[1] for b in bounds])
        
        result_local = minimize(
            objective,
            x0,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 2200, 'ftol': 1e-11, 'gtol': 1e-10}
        )
        
        if result_local.success and result_local.fun < best_loss:
            best_loss = result_local.fun
            best_result = result_local
    
    # Stage 3: Final polishing with very tight tolerances
    result_final = minimize(
        objective,
        best_result.x,
        method='L-BFGS-B',
        bounds=bounds,
        options={'maxiter': 3500, 'ftol': 1e-12, 'gtol': 1e-11}
    )
    
    if result_final.success and result_final.fun < best_loss:
        return result_final.x
    return best_result.x
# EVOLVE-BLOCK-END
#2 Run 3 R² = 0.961910
#3 Run 1 R² = 0.958625
#4 Run 2 R² = 0.955676
#5 Run 5 R² = 0.954018
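
Usage sketch

A minimal sketch of how the winning run's scaling_law_func and fit_scaling_law might be exercised end to end, assuming a small synthetic grid of (num_experts, dense_parameter_count) configurations and the standard coefficient-of-determination definition of R². The data values and the r_squared helper are illustrative assumptions, not part of the submitted run, and they do not reproduce the leaderboard scores.

Python
# Illustrative only: fit the scaling law defined above on hypothetical data
# and score it with a standard R^2. The losses below are made up for the sketch.
import numpy as np

def r_squared(y_true, y_pred):
    """Standard coefficient of determination: 1 - SS_res / SS_tot."""
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot

# Hypothetical grid: columns are [num_experts, dense_parameter_count]
X = np.array([
    [1.0,  1.3e8], [8.0,  1.3e8], [32.0, 1.3e8],
    [1.0,  1.3e9], [8.0,  1.3e9], [32.0, 1.3e9],
    [1.0,  6.5e9], [8.0,  6.5e9], [32.0, 6.5e9],
])
# Hypothetical validation losses for those configurations
y = np.array([3.60, 3.35, 3.28, 2.95, 2.74, 2.68, 2.55, 2.38, 2.33])

params = fit_scaling_law(X, y)        # three-stage fit defined above
preds = scaling_law_func(X, params)   # predictions on the same grid
print("fitted params:", np.round(params, 4))
print("R^2 on the fit data:", round(float(r_squared(y, preds)), 4))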