
MoE Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.958443
Mean R²: 0.957975
Min R²: 0.957176
Runs: 5

All Runs (sorted by R²)

Best Run 2 R² = 0.958443
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Improved program using Variable Projection with Regularized NNLS.
Model: L = c0 + c1 * N^-a1 + c2 * N^-a2 * E^-beta
Inputs N and E are normalized to [0, 1] range to improve numerical conditioning 
of the optimization problem. The linear coefficients are solved using regularized 
Non-Negative Least Squares to handle collinearity and prevent overfitting.
"""
import numpy as np
from scipy.optimize import least_squares, nnls

def scaling_law_func(data_points, params):
    """
    Predicts validation loss.
    Model: L = c0 + c1 * N^(-a1) + c2 * N^(-a2) * E^(-beta)
    
    N is normalized by 1e9.
    E is normalized by 64.0.
    
    Params: [c0, c1, a1, c2, a2, beta]
    """
    X = np.atleast_2d(np.asarray(data_points))
    # X[:, 0] is num_experts (E)
    # X[:, 1] is dense_parameter_count (N)
    
    E = X[:, 0]
    N = X[:, 1]
    
    # Normalize inputs for numerical stability
    # N ranges ~1e8-8e8 -> 0.1-0.8
    # E ranges 1-64 -> 0.015-1.0
    N_norm = N / 1e9
    E_norm = E / 64.0
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
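    # Accept a single parameter vector or a batch of T vectors; broadcasting
    # below evaluates all T parameter settings against every data point at once.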
    
    # Extract parameters [c0, c1, a1, c2, a2, beta]
    c0   = params[:, 0][:, None]
    c1   = params[:, 1][:, None]
    a1   = params[:, 2][:, None]
    c2   = params[:, 3][:, None]
    a2   = params[:, 4][:, None]
    beta = params[:, 5][:, None]
    
    # Safe calculations
    N_safe = np.maximum(N_norm, 1e-10)
    E_safe = np.maximum(E_norm, 1e-10)
    
    log_N = np.log(N_safe)
    log_E = np.log(E_safe)
    
    # Term 1: N^(-a1)
    # Shape: (T, N_samples)
    term1 = np.exp(-a1 * log_N[None, :])
    
    # Term 2: N^(-a2) * E^(-beta)
    term2 = np.exp(-a2 * log_N[None, :] - beta * log_E[None, :])
    
    # Combine
    pred = c0 + c1 * term1 + c2 * term2
    
    # Return shape (N_samples, T) or (N_samples,)
    pred = pred.T
    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law using Variable Projection (VarPro) with regularized NNLS.
    Optimizes exponents [a1, a2, beta] using Trust Region Reflective (TRF) algorithm.
    Optimizes coefficients [c0, c1, c2] using regularized Non-Negative Least Squares.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    if y.ndim == 1:
        y = y[:, None]
        
    n_samples = X.shape[0]
    E = X[:, 0]
    N = X[:, 1]
    
    # Normalize inputs
    N_norm = N / 1e9
    E_norm = E / 64.0
    
    # Precompute logs
    log_N = np.log(np.maximum(N_norm, 1e-10))
    log_E = np.log(np.maximum(E_norm, 1e-10))
    
    results = []
    
    # Regularization strength for NNLS
    # Prevents overfitting to noise and handles collinearity
    l2_reg = 1e-5
    sqrt_lam = np.sqrt(l2_reg)
    
    for i in range(y.shape[1]):
        y_curr = y[:, i]
        
        # Augmented target for regularization
        y_aug = np.concatenate([y_curr, np.zeros(3)])
        
        def solve_inner(exponents, return_coeffs=False):
            a1, a2, beta = exponents
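            # VarPro inner step: with the exponents held fixed, the model is
            # linear in [c0, c1, c2], so those coefficients are recovered
            # exactly below and the outer optimizer searches only the exponents.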
            
            # Basis functions
            # b0 = 1
            # b1 = N^-a1
            # b2 = N^-a2 * E^-beta
            b1 = np.exp(-a1 * log_N)
            b2 = np.exp(-a2 * log_N - beta * log_E)
            
            # Design matrix (n_samples, 3)
            A = np.vstack([np.ones(n_samples), b1, b2]).T
            
            # Column scaling for numerical conditioning
            # This balances the magnitude of bias (1.0) and power terms
            norms = np.linalg.norm(A, axis=0)
            norms[norms < 1e-10] = 1.0
            A_scaled = A / norms
            
            # Regularized NNLS
            # min ||A_scaled * c' - y||^2 + lambda ||c'||^2
            # Equivalent to solving augmented system
            reg_block = np.eye(3) * sqrt_lam
            A_aug = np.vstack([A_scaled, reg_block])
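            # Stacking sqrt(lambda) * I beneath A_scaled makes
            # ||A_aug @ c - y_aug||^2 = ||A_scaled @ c - y||^2 + lambda * ||c||^2,
            # so one NNLS call on the augmented system solves the ridge-penalized fit.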
            
            c_scaled, _ = nnls(A_aug, y_aug)
            coeffs = c_scaled / norms
            
            if return_coeffs:
                return coeffs
            
            # Return residuals of the AUGMENTED problem
            # This ensures the outer optimizer sees the regularization cost
            return A_aug @ c_scaled - y_aug

        # Grid search for initialization
        # Helps to find a good basin of attraction
        best_loss = np.inf
        best_exp = [0.5, 0.5, 0.2]
        
        # Grid points covering typical scaling regimes
        grid_a = [0.2, 0.6, 1.2]
        grid_b = [0.0, 0.3, 0.8]
        
        for ga1 in grid_a:
            for ga2 in grid_a:
                for gb in grid_b:
                    try:
                        res = solve_inner([ga1, ga2, gb])
                        loss = np.sum(res**2)
                        if loss < best_loss:
                            best_loss = loss
                            best_exp = [ga1, ga2, gb]
                    except Exception:
                        pass
            
        # Refine with least_squares (TRF)
        try:
            res_opt = least_squares(
                solve_inner, 
                x0=best_exp, 
                bounds=([0.0, 0.0, 0.0], [5.0, 5.0, 5.0]),
                method='trf',
                loss='linear',
                ftol=1e-7, xtol=1e-7, max_nfev=150
            )
            final_exps = res_opt.x
        except Exception:
            final_exps = best_exp
            
        final_coeffs = solve_inner(final_exps, return_coeffs=True)
        
        # Pack parameters: [c0, c1, a1, c2, a2, beta]
        params = np.array([
            final_coeffs[0], final_coeffs[1], final_exps[0],
            final_coeffs[2], final_exps[1], final_exps[2]
        ])
        results.append(params)
        
    return np.array(results) if len(results) > 1 else results[0]
# EVOLVE-BLOCK-END
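
A minimal usage sketch (not part of the submitted program): the expert counts, parameter counts, noise level, and "true" parameter values below are invented for illustration, and it assumes scaling_law_func and fit_scaling_law from the block above are in scope, with data_points columns ordered [num_experts, dense_parameter_count] as the code expects.

Python
import numpy as np

rng = np.random.default_rng(0)
E = rng.choice([1, 2, 4, 8, 16, 32, 64], size=40).astype(float)  # num_experts
N = rng.uniform(1e8, 8e8, size=40)                               # dense parameter count
X = np.column_stack([E, N])

# Losses drawn from the same functional form plus small noise;
# the [c0, c1, a1, c2, a2, beta] values here are made up for the demo.
true_params = np.array([1.8, 0.5, 0.4, 0.3, 0.6, 0.25])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, size=40)

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - y.mean()) ** 2)
print("fitted params:", np.round(fitted, 4))
print("R^2:", round(r2, 4))
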
#2 Run 1 R² = 0.958086
#3 Run 4 R² = 0.958086
#4 Run 5 R² = 0.958086
#5 Run 3 R² = 0.957176