
Domain Mixture Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.998086
Mean R²: 0.977918
Min R²: 0.910012
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.998086
Python
# EVOLVE-BLOCK-START
"""
Enhanced scaling law for multi-domain LLM finetuning.
Optimized with smart initialization, adaptive regularization, and a refined two-stage optimization strategy.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Multi-domain loss prediction using power law basis
    
    Model: loss[d] = sum_f(c[d,f] * x[f]^a[f]) + b[d]
    
    params layout (35 total):
    [0-4]: shared exponents for input features
    [5-9]: domain-specific bias terms
    [10-34]: coefficients (5×5 matrix, one row per output domain)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64).ravel()
    
    # Parse parameters with numerical stability
    shared_exp = np.clip(params[:5], -2.0, 2.0)
    domain_bias = params[5:10]
    coeffs = params[10:35].reshape(5, 5)
    
    # Safe input preparation for power operations
    X_safe = np.clip(X, 1e-8, 1.0)
    
    # Vectorized computation
    X_powered = X_safe ** shared_exp[None, :]  # (N, 5)
    pred = X_powered @ coeffs.T + domain_bias[None, :]  # (N, 5)
    
    # Clip to valid loss range
    return np.clip(pred, 1.0, 5.0)


def fit_scaling_law(data_points, loss_values):
    """
    Fit scaling law with advanced initialization and adaptive regularization
    
    Key improvements:
    1. Per-domain least-squares coefficient initialization
    2. Adaptive exponent initialization based on domain variance
    3. Centered/scaled features for better initialization numerics
    4. Adaptive regularization scaled by data statistics
    5. Two-stage optimization with progressive refinement
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    # Ensure y has shape (N, D): promote a 1-D loss vector to a single column
    # (np.atleast_2d would instead give shape (1, N))
    if y.ndim == 1:
        y = y[:, None]
    
    N, F = X.shape
    D = y.shape[1]
    
    # Compute data statistics for adaptive initialization
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    
    # Initialize parameters
    init_params = np.zeros(35)
    
    # Adaptive exponent initialization based on domain variance
    y_var_normalized = y_std / (np.max(y_std) + 1e-8)
    init_params[0:5] = 0.3 + 0.3 * y_var_normalized  # Range [0.3, 0.6]
    
    # Initialize biases from per-domain means
    init_params[5:10] = y_mean[:5]
    
    # Improved coefficient initialization using centered/scaled features
    X_safe = np.clip(X, 1e-8, 1.0)
    X_centered = X_safe - np.mean(X_safe, axis=0, keepdims=True)
    X_std = np.std(X_safe, axis=0, keepdims=True) + 1e-8
    X_scaled = X_centered / X_std
    
    # Fit coefficients per domain using scaled features
    for d in range(D):
        try:
            y_d = y[:, d] - np.mean(y[:, d])
            # Solve least-squares with scaled features for better numerics
            c_d = np.linalg.lstsq(X_scaled, y_d, rcond=None)[0]
            # Normalize to prevent extreme initialization
            c_d_norm = np.linalg.norm(c_d) + 1e-8
            c_d_normalized = c_d / c_d_norm
            init_params[10 + d*5:10 + (d+1)*5] = np.clip(c_d_normalized, -1.0, 1.0)
        except Exception:
            # Fall back to small uniform coefficients if the least-squares solve fails
            init_params[10 + d*5:10 + (d+1)*5] = 0.02
    
    # Objective function with adaptive regularization
    def objective(flat_params):
        try:
            pred = scaling_law_func(X, flat_params)
            
            if pred.shape != y.shape:
                return 1e10
            
            # Main MSE loss
            mse = np.mean((pred - y) ** 2)
            
            # Adaptive coefficient regularization
            coeffs = flat_params[10:35]
            coeff_reg = 0.0006 * np.mean(coeffs ** 2)
            
            # Exponent regularization: keep close to initialized adaptive values
            exp_deviation = flat_params[0:5] - init_params[0:5]
            exp_reg = 0.00008 * np.sum(exp_deviation ** 2)
            
            # Bias regularization: keep biases anchored to data mean
            bias_deviation = flat_params[5:10] - y_mean[:5]
            bias_reg = 0.00005 * np.sum(bias_deviation ** 2)
            
            return mse + coeff_reg + exp_reg + bias_reg
        except Exception:
            # Penalize any parameter vector that triggers a numerical failure
            return 1e10
    
    # Adaptive bounds based on data statistics
    loss_min, loss_max = np.min(y), np.max(y)
    
    bounds = [
        *[(-2.0, 2.0)] * 5,                          # exponents
        *[(loss_min - 0.3, loss_max + 0.3)] * 5,     # biases
        *[(-1.5, 1.5)] * 25                           # coefficients
    ]
    
    # Primary optimization with balanced settings
    result = minimize(
        objective,
        init_params,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 1500,
            'ftol': 1e-9,
            'gtol': 1e-8,
            'maxcor': 15
        }
    )
    
    # Secondary fine-tuning for improved convergence
    if result.success:
        result2 = minimize(
            objective,
            result.x,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 500,
                'ftol': 1e-10,
                'gtol': 1e-9,
                'maxcor': 20
            }
        )
        # Use better result
        if result2.fun < result.fun:
            return result2.x.ravel()
        return result.x.ravel()
    else:
        return init_params.ravel()
# EVOLVE-BLOCK-END
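
Below is a minimal usage sketch (not part of the submitted run), assuming the benchmark supplies data_points as an (N, 5) array of domain-mixture features in (0, 1] and loss_values as an (N, 5) array of per-domain losses, as the parameter layout above implies. The synthetic data and the true_params values are illustrative only.

# Minimal usage sketch: synthetic data, fit, and an R^2 sanity check.
# All names below except scaling_law_func / fit_scaling_law are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.uniform(0.05, 1.0, size=(40, 5))       # 40 synthetic mixture points, 5 features

# Build a plausible ground-truth parameter vector in the layout documented above
true_params = np.concatenate([
    np.full(5, 0.5),                                 # shared exponents a[f]
    np.full(5, 2.0),                                 # domain biases b[d]
    rng.uniform(0.05, 0.4, size=25),                 # coefficients c[d, f]
])
y_demo = scaling_law_func(X_demo, true_params)       # noiseless synthetic per-domain losses

fitted = fit_scaling_law(X_demo, y_demo)
pred = scaling_law_func(X_demo, fitted)

ss_res = np.sum((y_demo - pred) ** 2)
ss_tot = np.sum((y_demo - y_demo.mean(axis=0)) ** 2)
print("synthetic-fit R^2:", 1.0 - ss_res / ss_tot)
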
#2 Run 3 R² = 0.997906
#3 Run 5 R² = 0.993756
#4 Run 4 R² = 0.989829
#5 Run 2 R² = 0.910012