
Domain Mixture Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.998727
Mean R²: 0.997472
Min R²: 0.995904
Runs: 5

All Runs (sorted by R²)

#1 Run 4 R² = 0.998727 (best)
Python
# EVOLVE-BLOCK-START
"""
Refined per-domain scaling law with robust cross-domain modeling
Uses 35 parameters: 7 per domain with enhanced numerical stability and fitting
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Per-domain scaling law with comprehensive modeling.
    Each of 5 domains uses 7 parameters (35 total):
    - scale: power law coefficient
    - exponent: power law shape (clipped for stability)
    - bias: baseline loss offset
    - quad: quadratic self-interaction
    - cross1, cross2, cross3: linear effects of the first three other domains
    
    Model: loss_d = scale * prop_d^exp + quad * prop_d^2 + 
                    sum(cross_i * prop_other_i) + bias
    """
    X = np.atleast_2d(np.asarray(data_points))  # (N, 5)
    N, F = X.shape
    params = np.asarray(params).flatten()
    
    # Ensure exactly 35 parameters
    if len(params) < 35:
        params = np.pad(params, (0, 35 - len(params)), constant_values=0.0)
    params = params[:35]
    
    # Numerical stability with safe clipping
    X_safe = np.clip(X, 1e-9, 1.0)
    
    predictions = np.zeros((N, F))
    
    for d in range(F):
        idx = d * 7
        
        # Extract domain parameters
        scale = params[idx]
        exponent = params[idx + 1]
        bias = params[idx + 2]
        quad = params[idx + 3]
        cross_weights = params[idx + 4:idx + 7]
        
        # Clip exponent for stability
        exponent = np.clip(exponent, 0.1, 2.3)
        
        # Power law: main self-effect
        power_term = scale * (X_safe[:, d] ** exponent)
        
        # Quadratic: concentration effects
        quad_term = quad * (X_safe[:, d] ** 2)
        
        # Cross-domain: linear terms over the first three other domains (index order)
        cross_term = 0.0
        other_domains = [i for i in range(F) if i != d]
        for w_idx in range(min(3, len(other_domains))):
            cross_term += cross_weights[w_idx] * X_safe[:, other_domains[w_idx]]
        
        # Combine terms
        predictions[:, d] = power_term + quad_term + cross_term + bias
    
    return predictions
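
# Shape contract (for reference): scaling_law_func maps an (N, 5) array of
# domain proportions and a 35-vector of parameters to an (N, 5) array of
# predicted per-domain losses.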


def fit_scaling_law(data_points, loss_values):
    """
    Robust fitting with enhanced initialization and progressive optimization
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.atleast_2d(np.asarray(loss_values))
    
    N, F = X.shape
    
    # Shape correction
    if y.shape[1] != F:
        if y.shape[0] == F and y.shape[1] == N:
            y = y.T
        elif y.shape[1] == 1:
            y = np.tile(y, (1, F))
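    # e.g. losses supplied as (F, N) are transposed to (N, F); a single
    # column of losses is broadcast across all F domains.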
    
    # Data-driven initialization from per-domain loss statistics
    init = np.zeros(35)
    
    for d in range(F):
        idx = d * 7
        y_d = y[:, d]
        x_d = X[:, d]
        
        # Robust statistics using percentiles
        q10, q25, q50, q75 = np.percentile(y_d, [10, 25, 50, 75])
        iqr = q75 - q25
        
        # Estimate exponent via log-space regression on filtered data
        mask = (x_d > 0.01) & (y_d > q10)
        if np.sum(mask) > 3:
            log_x = np.log(x_d[mask] + 1e-7)
            log_y = np.log(np.maximum(y_d[mask] - q10 + 0.05, 0.05))
            
            if np.std(log_x) > 1e-5:
                # Least-squares slope in log-log space
                slope = np.polyfit(log_x, log_y, 1)[0]
                init[idx + 1] = np.clip(slope, 0.4, 1.1)
            else:
                init[idx + 1] = 0.65
        else:
            init[idx + 1] = 0.65
        
        # Scale: based on IQR
        init[idx] = iqr * 0.5
        
        # Bias: lower percentile baseline with median contribution
        init[idx + 2] = q10 * 0.8 + q50 * 0.2
        
        # Quadratic: small stabilizing term
        init[idx + 3] = iqr * 0.02
        
        # Cross-domain: small IQR-scaled weights with random jitter to break symmetry
        init[idx + 4:idx + 7] = iqr * 0.015 * (1.0 + 0.08 * np.random.randn(3))
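    # Note: np.random.randn draws from NumPy's global RNG, so this
    # initialization is stochastic; seed it for reproducible fits.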
    
    # Adaptive domain weighting: inverse variance. The weights depend only
    # on y, so compute them once outside the objective.
    domain_vars = np.var(y, axis=0)
    weights = 1.0 / (domain_vars + 0.03)
    weights = F * weights / np.sum(weights)

    def objective(params):
        try:
            pred = scaling_law_func(X, params)

            mse = np.mean(((pred - y) ** 2) * weights[None, :])
            
            # Hierarchical regularization
            # Lightest on main effects
            r_scale = 0.00002 * np.sum(params[::7] ** 2)
            r_bias = 0.00001 * np.sum(params[2::7] ** 2)
            
            # Moderate on shape parameters
            r_exp = 0.00007 * np.sum((params[1::7] - 0.7) ** 2)
            r_quad = 0.00009 * np.sum(params[3::7] ** 2)
            
            # Stronger on cross-domain to prevent overfitting
            r_cross = 0.00028 * (np.sum(params[4::7] ** 2) + 
                                  np.sum(params[5::7] ** 2) + 
                                  np.sum(params[6::7] ** 2))
            
            return mse + r_scale + r_bias + r_exp + r_quad + r_cross
        except Exception:
            # Penalize parameter vectors that trigger numerical errors
            return 1e10
    
    # Per-domain parameter bounds (same pattern for each domain)
    bounds = []
    for _ in range(F):
        bounds.extend([
            (-9, 13),      # scale
            (0.1, 2.3),    # exponent
            (-7, 11),      # bias
            (-3, 3),       # quad
            (-4, 4),       # cross1
            (-4, 4),       # cross2
            (-4, 4)        # cross3
        ])
    
    # Progressive multi-stage optimization
    best_params = init
    best_loss = objective(init)
    
    # Stage 1: Exploration with moderate tolerance
    res1 = minimize(objective, init, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 1400, 'ftol': 1e-9, 'gtol': 1e-8})
    
    # Accept any improvement; L-BFGS-B can stop at maxiter without success
    if res1.fun < best_loss:
        best_params = res1.x
        best_loss = res1.fun
    
    # Stage 2: Refinement with tighter tolerance
    res2 = minimize(objective, best_params, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 1200, 'ftol': 1e-11, 'gtol': 1e-9})
    
    if res2.fun < best_loss:
        best_params = res2.x
        best_loss = res2.fun
    
    # Stage 3: Final polishing
    res3 = minimize(objective, best_params, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 600, 'ftol': 1e-12})
    
    if res3.fun < best_loss:
        best_params = res3.x
    
    return best_params
# EVOLVE-BLOCK-END
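
For reference, below is a minimal usage sketch showing how the two functions above might be exercised end to end. The Dirichlet-sampled mixtures, the synthetic loss surface, and the pooled R² convention are illustrative assumptions, not necessarily the benchmark's actual evaluation harness.

Python
import numpy as np

# Minimal sketch: assumes scaling_law_func and fit_scaling_law from the
# evolve block above are in scope. All data here is synthetic illustration.
rng = np.random.default_rng(0)

# 200 mixtures over 5 domains, each row a proportion vector summing to 1.
X = rng.dirichlet(np.ones(5), size=200)

# Hypothetical per-domain losses: rough power-law shape plus noise.
y = 2.0 * X ** 0.6 + 0.5 + 0.05 * rng.standard_normal(X.shape)

params = fit_scaling_law(X, y)       # 35-vector: 7 parameters per domain
pred = scaling_law_func(X, params)   # (200, 5) predicted losses

# Pooled R^2 over all (mixture, domain) pairs (one common convention).
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print(f"pooled R^2 = {1.0 - ss_res / ss_tot:.6f}")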
#2 Run 3 R² = 0.998636
#3 Run 5 R² = 0.997703
#4 Run 2 R² = 0.996392
#5 Run 1 R² = 0.995904