
Domain Mixture Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.996946
Mean R²: 0.996107
Min R²: 0.995604
Runs: 5

All Runs (sorted by R²)

Best Run 5 R² = 0.996946
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM fine-tuning scenarios.
Improved with a "Non-linear Signed Rank-1 Transfer" model (35 parameters).
This model generalizes the successful Rank-1 approach by letting the
transfer impact scale non-linearly with donor data size (the tau parameter),
while retaining signed transfer (interference) and pre-training offsets.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Non-linear Signed Rank-1 Transfer Scaling Law.
    
    Model:
    Loss_i = A_i * (D_total_i)^(-alpha_i) + E_i
    D_total_i = D_pre_i + x_i + beta_i * sum_{j != i} (gamma_j * x_j^(tau_j))
    
    Parameters (35):
    0-4:   log_A (Amplitude)
    5-9:   log_alpha (Scaling exponent)
    10-14: log_E (Irreducible loss)
    15-19: logit_beta (Receptivity)
    20-24: raw_gamma (Transferability strength, signed)
    25-29: log_D_pre (Pre-training offset)
    30-34: log_tau (Transfer non-linearity)
    """
    X = np.atleast_2d(np.asarray(data_points))
    params = np.asarray(params).flatten()
    
    # Parameter Extraction
    log_A = params[0:5]
    log_alpha = params[5:10]
    log_E = params[10:15]
    logit_beta = params[15:20]
    raw_gamma = params[20:25]
    log_D_pre = params[25:30]
    log_tau = params[30:35]
    
    # Transformation to physical space
    A = np.exp(log_A)
    alpha = np.exp(log_alpha)
    E = np.exp(log_E)
    beta = 1.0 / (1.0 + np.exp(-logit_beta))
    gamma = np.tanh(raw_gamma)
    D_pre = np.exp(log_D_pre)
    tau = np.exp(log_tau)
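    # These transforms keep every quantity in its valid range: A, alpha, E,
    # D_pre, and tau stay positive via exp; beta lies in (0, 1) via the sigmoid;
    # gamma lies in (-1, 1) via tanh, so negative values can model interference.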
    
    # Compute transformed inputs for transfer
    # Add a small epsilon so x**tau and its gradient (which involves log(x)) stay finite at x = 0
    # X_trans_j = x_j ^ tau_j
    X_safe = X + 1e-9
    X_trans = X_safe ** tau[None, :]
    
    # Weighted transfer components
    # Gamma_X: (N, 5)
    Gamma_X = X_trans * gamma[None, :]
    
    # Sum of all transfer sources (N, 1)
    Sum_Gamma_X = np.sum(Gamma_X, axis=1, keepdims=True)
    
    # Transfer to i from others: Sum - Self
    # Subtract own contribution: gamma_i * x_i^tau_i
    Transfer_Input = Sum_Gamma_X - Gamma_X
    
    # Apply receptivity
    # beta is (5,) -> (1, 5)
    Transfer_Term = beta[None, :] * Transfer_Input
    
    # Total Effective Data
    # D_total = D_pre + x_i + Transfer
    # Note: Target domain data x_i is treated linearly (standard scaling)
    D_total = D_pre[None, :] + X + Transfer_Term
    
    # Numerical stability
    D_total = np.maximum(D_total, 1e-8)
    
    # Predictions
    pred = A[None, :] * (D_total ** -alpha[None, :]) + E[None, :]
    
    return pred

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points))
    Y = np.atleast_2d(np.asarray(loss_values))
    n_domains = 5
    
    # --- Stage 1: Robust Independent Fit ---
    # Fit A, alpha, E, D_pre assuming no transfer.
    # This establishes the baseline scaling curve for each domain.
    
    init_params_list = []
    
    for i in range(n_domains):
        x_i = X[:, i]
        y_i = Y[:, i]
        y_min, y_max = np.min(y_i), np.max(y_i)
        
        # Grid search for initialization to avoid local minima
        best_mse = float('inf')
        best_p = None
        
        # Heuristic grid
        # alpha: slope, d_guess: offset
        for d_guess in [1e-4, 1e-2, 0.1]:
            for alpha_guess in [0.3, 0.7, 1.5]:
                # Estimate A based on endpoints
                # y_range ~ A * (d^-alpha - (1+d)^-alpha)
                denom = (d_guess**-alpha_guess) - ((1.0+d_guess)**-alpha_guess)
                denom = max(denom, 1e-6)
                A_guess = max((y_max - y_min) / denom, 1e-4)
                
                p0_local = [
                    np.log(A_guess),
                    np.log(alpha_guess),
                    np.log(max(y_min - 0.05, 1e-5)),
                    np.log(d_guess)
                ]
                
                def obj_local(p):
                    A = np.exp(p[0])
                    al = np.exp(p[1])
                    E = np.exp(p[2])
                    d = np.exp(p[3])
                    pred = A * ((x_i + d)**-al) + E
                    return np.mean((pred - y_i)**2)
                
                try:
                    # L-BFGS-B with bounds
                    # alpha in ~[0.05, 7.4], E < y_min, D_pre in ~[1e-6, 1]
                    bounds = [(None, None), (-3, 2), (None, np.log(y_min)), (-14, 0)]
                    res = minimize(obj_local, p0_local, method='L-BFGS-B', bounds=bounds)
                    if res.fun < best_mse:
                        best_mse = res.fun
                        best_p = res.x
                except Exception:
                    # Skip failed local fits; other grid points may still succeed
                    pass
        
        if best_p is None:
            # Fallback
            best_p = np.array([0.0, -0.5, np.log(max(y_min-0.1, 1e-5)), -4.6])
            
        init_params_list.append(best_p)
        
    init_params_list = np.array(init_params_list)
    
    # --- Stage 2: Joint Optimization ---
    # Initialize full model parameters.
    # Start with tau = 1.0 (log_tau = 0) -> Linear transfer initially.
    # Start with beta small, gamma neutral.
    
    p0_joint = np.concatenate([
        init_params_list[:, 0], # log_A
        init_params_list[:, 1], # log_alpha
        init_params_list[:, 2], # log_E
        np.full(5, -3.0),       # logit_beta (sigmoid(-3) ~ 0.05)
        np.full(5, 0.0),        # raw_gamma (tanh(0) = 0)
        init_params_list[:, 3], # log_D_pre
        np.full(5, 0.0)         # log_tau (tau = 1)
    ])
    
    def obj_joint(p):
        preds = scaling_law_func(X, p)
        mse = np.mean((preds - Y)**2)
        
        # Regularization
        
        # 1. Weak L2 on all params to prevent drift
        reg_l2 = 1e-7 * np.sum(p**2)
        
        # 2. Sparsity on gamma (prefer simpler transfer models)
        # indices 20-24 are raw_gamma
        reg_gamma = 1e-5 * np.sum(p[20:25]**2)
        
        # 3. Regularize tau towards 1.0 (log_tau towards 0)
        # indices 30-34 are log_tau
        # We prefer linear transfer unless data strongly suggests otherwise
        reg_tau = 1e-4 * np.sum(p[30:35]**2)
        
        # 4. Stability for D_pre (indices 25-29)
        # Penalize if log_d < -12 (d < 6e-6) to avoid numerical instability
        log_d = p[25:30]
        pen_d = 1e-3 * np.sum(np.maximum(0, -12.0 - log_d)**2)
        
        return mse + reg_l2 + reg_gamma + reg_tau + pen_d
        
    try:
        # BFGS is efficient for 35 parameters
        res = minimize(obj_joint, p0_joint, method='BFGS', 
                      options={'maxiter': 2000, 'gtol': 1e-6})
        best_p = res.x
    except Exception:
        # Fall back to the Stage-1 initialization if the joint fit fails
        best_p = p0_joint
        
    return best_p
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.996107
#3 Run 3 R² = 0.996106
#4 Run 4 R² = 0.995770
#5 Run 1 R² = 0.995604
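
A minimal usage sketch of the best run's functions, assuming scaling_law_func and fit_scaling_law are in scope, data_points is an (N, 5) array of per-domain data sizes, and loss_values is an (N, 5) array of per-domain losses. The synthetic arrays and the pooled R² computation below are illustrative only and are not part of the benchmark harness.

Python
import numpy as np

# Synthetic mixtures over 5 domains (illustrative values, not benchmark data)
rng = np.random.default_rng(0)
data_points = rng.uniform(0.01, 1.0, size=(20, 5))

# Generate losses from the model itself with hand-picked "true" parameters
true_params = np.concatenate([
    np.zeros(5),        # log_A     -> A = 1
    np.full(5, -0.7),   # log_alpha -> alpha ~ 0.5
    np.full(5, -1.0),   # log_E     -> E ~ 0.37
    np.full(5, -3.0),   # logit_beta
    np.full(5, 0.3),    # raw_gamma
    np.full(5, -4.0),   # log_D_pre
    np.zeros(5),        # log_tau   -> tau = 1
])
loss_values = scaling_law_func(data_points, true_params)

# Fit, predict, and score with a pooled R^2 over all domains
fitted = fit_scaling_law(data_points, loss_values)
pred = scaling_law_func(data_points, fitted)
ss_res = np.sum((loss_values - pred) ** 2)
ss_tot = np.sum((loss_values - loss_values.mean()) ** 2)
print("R^2 =", 1.0 - ss_res / ss_tot)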