SLD - LR-BSZ Scaling Law - SLDAgent + Claude Sonnet 4.5

All Runs (sorted by R²)

Best Run 3 R² = 0.921644

▼

Python

# EVOLVE-BLOCK-START
"""
Physics-informed scaling law with improved hyperparameter modeling
Key improvements:
1. Chinchilla power laws with data-driven exponent fitting
2. Learning rate modeled via μP-inspired optimal scaling (lr_opt ~ 1/N)
3. Batch size effect via gradient noise scale theory with critical batch
4. Reduced to 9 parameters for better generalization
5. Enhanced optimizer with adaptive bounds and multi-stage refinement
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Scaling law: L = L_inf + A/N^α + B/D^β + lr_penalty + bsz_penalty
    
    Key physics:
    - lr_penalty: quadratic deviation from μP-style optimal LR
    - bsz_penalty: gradient noise model with critical batch size
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if params.ndim == 1:
        params = params[None, :]
    
    # Extract and normalize features with safety bounds
    lr = np.clip(X[:, 0], 1e-10, 1.0)
    bsz = np.clip(X[:, 1], 1.0, 1e8)
    D = np.clip(X[:, 2], 1e6, 1e15)
    N = np.clip(X[:, 3], 1e6, 1e12)
    
    # Unpack parameters (9 total - balanced complexity)
    L_inf = params[:, 0:1].T       # Irreducible loss
    A = params[:, 1:2].T           # Model size coefficient
    alpha = params[:, 2:3].T       # Model size exponent
    B = params[:, 3:4].T           # Data coefficient
    beta = params[:, 4:5].T        # Data exponent
    gamma = params[:, 5:6].T       # LR penalty scale
    lr_exp = params[:, 6:7].T      # LR-N coupling (μP theory)
    delta = params[:, 7:8].T       # BSZ penalty scale
    bsz_exp = params[:, 8:9].T     # BSZ-D coupling
    
    # Core Chinchilla-style power laws
    model_term = A / np.power(N[:, None], alpha)
    data_term = B / np.power(D[:, None], beta)
    
    # Learning rate penalty with μP-inspired scaling
    # Optimal LR scales inversely with model size: lr_opt ~ N^(-1)
    # Base LR of 0.005 is empirically reasonable for standard parameterization
    lr_opt = 0.005 * np.power(N[:, None], lr_exp)
    lr_ratio = lr[:, None] / np.clip(lr_opt, 1e-10, 1.0)
    
    # Symmetric quadratic penalty in log-space with gentle tails
    log_lr_ratio = np.log(lr_ratio)
    lr_penalty = gamma * (log_lr_ratio ** 2 + 0.05 * log_lr_ratio ** 4)
    
    # Batch size penalty with gradient noise theory
    # Critical batch size grows with data: B_crit ~ D^κ
    # Below critical: strong noise penalty; above: mild inefficiency
    bsz_crit = 128.0 * np.power(D[:, None] / 1e10, bsz_exp)
    bsz_ratio = bsz[:, None] / np.clip(bsz_crit, 8.0, 1e7)
    
    # Asymmetric penalty function
    # Small batches (ratio < 1): severe gradient noise
    # Large batches (ratio > 1): mild diminishing returns
    log_bsz_ratio = np.log(bsz_ratio)
    bsz_penalty = delta * np.where(
        bsz_ratio < 1.0,
        # Strong penalty for small batches: noise dominates
        0.5 * (1.0 / bsz_ratio - 1.0) + 0.3 * log_bsz_ratio ** 2,
        # Mild penalty for large batches: diminishing returns
        0.1 * log_bsz_ratio + 0.05 * log_bsz_ratio ** 2
    )
    
    # Total prediction
    pred = L_inf + model_term + data_term + lr_penalty + bsz_penalty
    
    return pred[:, 0] if pred.shape[1] == 1 else pred


def fit_scaling_law(data_points, loss_values):
    """
    Advanced three-stage fitting: global search → local refinement → final polish
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    if y.ndim == 1:
        y = y[:, None]
    
    T = y.shape[1]
    n_params = 9
    
    # Compute data statistics for adaptive bounds
    loss_min, loss_max = np.min(y), np.max(y)
    loss_range = loss_max - loss_min
    loss_std = np.std(y)
    loss_median = np.median(y)
    
    # Percentile-based bounds for robustness
    loss_p10 = np.percentile(y, 10)
    loss_p90 = np.percentile(y, 90)
    
    # Theory-informed parameter bounds
    bounds = [
        (loss_min - 0.4, loss_p10 + 0.1),    # L_inf: near achievable minimum
        (0.005, loss_range * 150),            # A: wide range for model term
        (0.08, 0.65),                         # alpha: 0.3-0.5 typical, allow broader
        (0.005, loss_range * 150),            # B: wide range for data term
        (0.08, 0.65),                         # beta: similar to alpha
        (0.0, loss_std * 8),                  # gamma: LR penalty strength
        (-1.2, -0.05),                        # lr_exp: negative (μP theory)
        (0.0, loss_std * 6),                  # delta: BSZ penalty strength
        (0.0, 0.3),                           # bsz_exp: positive (larger D → larger B_crit)
    ]
    
    def objective(flat_params):
        params = flat_params.reshape(T, n_params)
        try:
            pred = scaling_law_func(X, params)
            if pred.ndim == 1:
                pred = pred[:, None]
            
            # Robust loss: Huber-style combination
            residuals = pred - y
            abs_residuals = np.abs(residuals)
            
            # MSE for small errors, MAE for large (outlier robustness)
            huber_delta = 0.5 * loss_std
            huber_loss = np.where(
                abs_residuals <= huber_delta,
                0.5 * residuals ** 2,
                huber_delta * (abs_residuals - 0.5 * huber_delta)
            )
            main_loss = np.mean(huber_loss)
            
            # Regularization: prefer Chinchilla-like exponents
            reg_alpha = 0.015 * (params[:, 2] - 0.38) ** 2
            reg_beta = 0.015 * (params[:, 4] - 0.38) ** 2
            
            # Mild parameter magnitude regularization
            reg_l2 = 1e-9 * np.sum(params ** 2)
            
            return main_loss + reg_alpha + reg_beta + reg_l2
        except:
            return 1e16
    
    # Smart initialization based on low-loss samples
    low_loss_mask = y < np.percentile(y, 25)
    L_inf_init = np.mean(y[low_loss_mask]) - 0.15 if np.any(low_loss_mask) else loss_min
    
    init_params = np.array([
        np.clip(L_inf_init, loss_min - 0.3, loss_p10),
        loss_range * 12,      # A
        0.38,                 # alpha (Chinchilla default)
        loss_range * 10,      # B
        0.38,                 # beta
        0.4,                  # gamma
        -0.6,                 # lr_exp (μP-like)
        0.25,                 # delta
        0.15,                 # bsz_exp
    ])
    
    # Stage 1: Differential evolution with enhanced settings
    result_de = differential_evolution(
        objective,
        bounds=bounds * T,
        maxiter=600,
        popsize=30,
        seed=42,
        atol=1e-11,
        tol=1e-11,
        workers=1,
        strategy='best1bin',
        mutation=(0.3, 1.3),
        recombination=0.85,
        polish=False,
        init='sobol'  # Better space coverage than latinhypercube
    )
    
    best_params = result_de.x
    best_score = result_de.fun
    
    # Stage 2: L-BFGS-B refinement with multiple restarts
    for attempt in range(3):
        try:
            if attempt == 0:
                start_point = best_params
            else:
                # Add small perturbations for exploration
                noise_scale = 0.005 * (2 - attempt)
                start_point = best_params + np.random.randn(len(best_params)) * noise_scale
            
            result_lbfgs = minimize(
                objective,
                start_point,
                method='L-BFGS-B',
                bounds=bounds * T,
                options={'maxiter': 2500, 'ftol': 1e-15, 'gtol': 1e-13}
            )
            
            if result_lbfgs.success and result_lbfgs.fun < best_score:
                best_params = result_lbfgs.x
                best_score = result_lbfgs.fun
        except:
            continue
    
    # Stage 3: Powell for final polish (unconstrained but verify bounds)
    try:
        result_powell = minimize(
            objective,
            best_params,
            method='Powell',
            options={'maxiter': 1500, 'ftol': 1e-13, 'xtol': 1e-13}
        )
        
        if result_powell.success and result_powell.fun < best_score:
            # Verify all parameters within bounds
            params_check = result_powell.x.reshape(T, n_params)
            within_bounds = all(
                bounds[i][0] <= params_check[0, i] <= bounds[i][1]
                for i in range(n_params)
            )
            if within_bounds:
                best_params = result_powell.x
    except:
        pass
    
    params_opt = best_params.reshape(T, n_params)
    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#2 Run 2 R² = 0.914496

▼

Python

# EVOLVE-BLOCK-START
"""
Refined exponential scaling law with lr-batch interaction.
Combines proven exponential penalties with critical hyperparameter coupling.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Loss = (A/N^α + B/D^β + C) * exp(γ·log²(lr/lr_opt)) * exp(δ·log²(bsz/bsz_opt)) * exp(ε·log(lr)·log(bsz))
    
    Key features:
    - Chinchilla base: A/N^α + B/D^β + C
    - Exponential LR penalty (Gaussian in log-space) - proven effective
    - Exponential batch size penalty
    - Critical lr-batch interaction: optimal batch size couples with learning rate
    - 9 parameters for expressiveness while maintaining stability
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    lr = X[:, 0]
    bsz = X[:, 1]
    data_size = X[:, 2]
    param_size = X[:, 3]
    
    # 9 parameters: A, alpha, B, beta, C, gamma, lr_opt, delta, epsilon
    A, alpha, B, beta, C, gamma, lr_opt, delta, epsilon = params
    
    eps = 1e-10
    param_size = np.maximum(param_size, eps)
    data_size = np.maximum(data_size, eps)
    lr = np.maximum(lr, eps)
    bsz = np.maximum(bsz, eps)
    lr_opt = np.maximum(lr_opt, eps)
    
    # Base Chinchilla scaling
    base_loss = A / np.power(param_size, np.abs(alpha)) + \
                B / np.power(data_size, np.abs(beta)) + C
    
    # Learning rate penalty (Gaussian in log-space)
    lr_penalty = np.exp(gamma * np.square(np.log(lr / lr_opt)))
    
    # Batch size penalty (Gaussian in log-space, using adaptive reference)
    bsz_ref = 512.0  # Typical batch size
    bsz_penalty = np.exp(delta * np.square(np.log(bsz / bsz_ref)))
    
    # LR-Batch interaction: optimal batch size depends on learning rate
    # Higher LR may benefit from different batch sizes
    lr_ref = 1e-3
    lr_bsz_interaction = np.exp(epsilon * np.log(lr / lr_ref) * np.log(bsz / bsz_ref))
    
    pred = base_loss * lr_penalty * bsz_penalty * lr_bsz_interaction
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Two-stage optimization: global search then local refinement
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    # Statistics for initialization
    lr_median = np.median(X[:, 0])
    loss_min = np.min(y)
    loss_mean = np.mean(y)
    
    # Find minimum loss configuration
    min_idx = np.argmin(y)
    lr_at_min = X[min_idx, 0]
    
    # Parameter bounds: [A, alpha, B, beta, C, gamma, lr_opt, delta, epsilon]
    bounds = [
        (0.1, 50.0),              # A: model coefficient
        (0.1, 0.8),               # alpha: model exponent
        (0.1, 50.0),              # B: data coefficient
        (0.1, 0.8),               # beta: data exponent
        (loss_min * 0.3, loss_mean * 0.7),  # C: baseline
        (-2.0, 2.0),              # gamma: lr penalty strength
        (X[:, 0].min(), X[:, 0].max()),  # lr_opt
        (-1.0, 1.0),              # delta: batch size penalty
        (-0.3, 0.3)               # epsilon: lr-batch interaction
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            if not np.all(np.isfinite(pred)):
                return 1e10
            residuals = pred - y
            mse = np.mean(residuals ** 2)
            
            # Light regularization for extreme penalties
            reg = 0.0001 * (np.square(params[5]) + np.square(params[7]) + np.square(params[8]))
            
            return mse + reg
        except:
            return 1e10
    
    # Smart initialization
    x0 = np.array([
        12.0,              # A
        0.35,              # alpha (Chinchilla)
        15.0,              # B
        0.32,              # beta (Chinchilla)
        loss_min * 0.6,    # C
        0.0,               # gamma (neutral start)
        lr_at_min,         # lr_opt (near observed minimum)
        0.0,               # delta (neutral start)
        0.0                # epsilon (neutral interaction)
    ])
    
    # Global search with differential evolution
    result = differential_evolution(
        objective,
        bounds,
        seed=42,
        maxiter=250,
        popsize=15,
        atol=1e-8,
        tol=1e-8,
        workers=1,
        init='sobol',
        strategy='best1bin',
        x0=x0
    )
    
    # Local refinement with multiple attempts
    best_result = result
    
    for attempt in range(3):
        try:
            start_point = best_result.x
            if attempt > 0:
                rng = np.random.RandomState(42 + attempt)
                noise = rng.randn(len(start_point)) * 0.02
                start_point = start_point * (1.0 + noise)
                start_point = np.clip(start_point, 
                                     [b[0] for b in bounds], 
                                     [b[1] for b in bounds])
            
            local_result = minimize(
                objective,
                start_point,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 800, 'ftol': 1e-10}
            )
            
            if local_result.success and local_result.fun < best_result.fun:
                best_result = local_result
        except:
            pass
    
    return best_result.x
# EVOLVE-BLOCK-END

#3 Run 4 R² = 0.913341

▼

Python

# EVOLVE-BLOCK-START
"""
Proven asymmetric scaling law with enhanced numerical stability and optimization.
L = E + A/N^α + B/D^β + C*max(0,log(lr/lr_opt))^2 + D*|min(0,log(lr/lr_opt))| + F*exp(-bsz/G)

Key improvements (10 params, 2702 data points):
- Asymmetric LR: quadratic high penalty (catastrophic), linear low penalty (benign)
- Exponential batch saturation (proven stable)
- Enhanced numerical stability with careful clipping
- Improved optimization strategy with better convergence
- Strong Chinchilla regularization (α≈β≈0.35)
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Robust asymmetric scaling law with proven additive structure.
    High LR causes catastrophic failures (quadratic), low LR just slows convergence (linear).
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if params.ndim == 1:
        params = params[None, :]
    
    # Strict bounds for numerical stability
    lr = np.clip(X[:, 0], 1e-8, 1.0)
    bsz = np.clip(X[:, 1], 1.0, 1e5)
    D = np.clip(X[:, 2], 1e8, 1e12)
    N = np.clip(X[:, 3], 1e7, 1e10)
    
    preds = []
    for p in params:
        E = p[0]                          # Irreducible loss
        A = np.abs(p[1])                  # Model coefficient (forced positive)
        alpha = np.clip(p[2], 0.1, 0.6)   # Model exponent (Chinchilla range)
        B = np.abs(p[3])                  # Data coefficient (forced positive)
        beta = np.clip(p[4], 0.1, 0.6)    # Data exponent (Chinchilla range)
        C = np.abs(p[5])                  # High LR penalty (catastrophic)
        lr_opt = np.exp(np.clip(p[6], -10, 0))  # Optimal LR (realistic range)
        D_param = np.abs(p[7])            # Low LR penalty (benign)
        F = np.abs(p[8])                  # Batch inefficiency coefficient
        G = np.abs(p[9])                  # Batch saturation scale
        
        # Core Chinchilla scaling with stable computation
        base = E + A / (N ** alpha) + B / (D ** beta)
        
        # Asymmetric learning rate penalties
        log_ratio = np.log(lr / lr_opt)
        
        # High LR: quadratic penalty (superlinear, catastrophic divergence)
        lr_high = C * np.maximum(0, log_ratio) ** 2
        
        # Low LR: linear penalty (just slower convergence, not catastrophic)
        lr_low = D_param * np.abs(np.minimum(0, log_ratio))
        
        # Batch size: exponential saturation (small batches inefficient)
        bsz_pen = F * np.exp(-bsz / G)
        
        pred = base + lr_high + lr_low + bsz_pen
        preds.append(pred)
    
    preds = np.array(preds).T
    return preds[:, 0] if preds.shape[1] == 1 else preds


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced three-stage optimization with improved convergence.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    if y.ndim == 1:
        y = y[:, None]
    T = y.shape[1]
    
    # Analyze data for smart initialization
    lr_vals = X[:, 0]
    bsz_vals = X[:, 1]
    y_min = np.min(y)
    
    # Find optimal hyperparameters from elite performers (top 10%)
    elite_mask = y.ravel() < np.percentile(y, 10)
    lr_opt_init = np.median(lr_vals[elite_mask]) if np.any(elite_mask) else np.median(lr_vals)
    
    # 10 parameters: [E, A, alpha, B, beta, C, log_lr_opt, D, F, G]
    bounds = [
        (y_min * 0.7, y_min * 1.05),     # E: slightly below minimum
        (1e2, 1e9),                       # A: model coefficient
        (0.1, 0.6),                       # alpha: Chinchilla range
        (1e2, 1e11),                      # B: data coefficient
        (0.1, 0.6),                       # beta: Chinchilla range
        (0.0, 5.5),                       # C: high LR penalty
        (np.log(lr_vals.min() * 0.3), np.log(lr_vals.max() * 3.0)),  # log_lr_opt
        (0.0, 2.8),                       # D: low LR penalty
        (0.0, 2.2),                       # F: batch inefficiency
        (40.0, 1200.0),                   # G: batch saturation scale
    ]
    
    all_params = []
    
    for t in range(T):
        y_t = y[:, t]
        
        def objective(p):
            try:
                pred = scaling_law_func(X, p)
                mse = np.mean((pred - y_t) ** 2)
                
                # Strong Chinchilla regularization
                reg_chinchilla = 2.5e-3 * ((p[2] - 0.35)**2 + (p[4] - 0.35)**2)
                
                # Light coefficient regularization
                reg_coef = 1e-8 * (p[1]**2 + p[3]**2)
                
                # Moderate penalty term regularization
                reg_penalty = 6e-4 * (p[5]**2 + p[7]**2 + p[8]**2)
                
                return mse + reg_chinchilla + reg_coef + reg_penalty
            except:
                return 1e10
        
        # Stage 1: Broad global search
        res1 = differential_evolution(
            objective, bounds, seed=42, maxiter=240, popsize=14,
            atol=1e-7, tol=1e-7, workers=1, strategy='best1bin', polish=False
        )
        
        # Stage 2: Refined global search
        res2 = differential_evolution(
            objective, bounds, seed=123, maxiter=280, popsize=15,
            atol=1e-8, tol=1e-8, workers=1, init='latinhypercube',
            x0=res1.x, polish=False
        )
        
        # Stage 3: Local refinement with multiple restarts
        best = res2.x
        best_loss = res2.fun
        
        for restart in range(3):
            # Add small perturbation for diversity
            init_point = best + np.random.randn(10) * 0.015 if restart > 0 else best
            
            refined = minimize(
                objective, init_point,
                method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 1500, 'ftol': 1e-11, 'gtol': 1e-9}
            )
            
            if refined.success and refined.fun < best_loss:
                best_loss = refined.fun
                best = refined.x
        
        all_params.append(best)
    
    params_array = np.array(all_params)
    return params_array[0] if T == 1 else params_array
# EVOLVE-BLOCK-END

#4 Run 1 R² = 0.910593

▼

Python

# EVOLVE-BLOCK-START
"""
Enhanced scaling law incorporating best practices:
- Chinchilla power laws (A/N^α + B/D^β + C) as foundation
- Asymmetric LR penalty: low LR (underfitting) >> high LR (instability)
- Smooth batch size saturation with critical point
- Multiplicative structure for cross-scale robustness
- 11 parameters for optimal expressiveness/generalizability balance
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Multiplicative scaling law:
    L = (A/N^α + B/D^β + C) * (1 + E_low*max(0,-log(lr/lr_o))^ζ_low + E_high*max(0,log(lr/lr_o))^ζ_high) * (1 + F/(1+(bsz/b_c)^η))
    
    Key insights:
    - Underfitting (low LR) causes worse degradation than instability (high LR)
    - Batch size has diminishing returns with smooth transition
    - Multiplicative effects preserve relative scale relationships
    
    Features: [lr, bsz, data_size, non_embedding_param_size]
    params = [A, α, B, β, C, E_low, E_high, log_lr_opt, F, log_bsz_crit, η]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    lr = X[:, 0]
    bsz = X[:, 1]
    data_size = X[:, 2]
    model_params = X[:, 3]
    
    # Extract and constrain parameters for numerical stability
    A = np.abs(params[0]) + 1e-10
    alpha = np.clip(params[1], 0.16, 0.54)
    B = np.abs(params[2]) + 1e-10
    beta = np.clip(params[3], 0.16, 0.54)
    C = params[4]
    E_low = np.abs(params[5])    # Penalty for low LR (stronger)
    E_high = np.abs(params[6])   # Penalty for high LR (weaker)
    lr_opt = np.exp(params[7])
    F = np.abs(params[8])         # Batch size penalty magnitude
    bsz_crit = np.exp(params[9])  # Critical batch size
    eta = np.clip(params[10], 0.38, 1.85)  # Transition sharpness
    
    # Core Chinchilla-style power laws
    base_loss = A / (model_params ** alpha) + B / (data_size ** beta) + C
    
    # Asymmetric learning rate penalty (multiplicative)
    # Physics: Low LR causes underfitting (worse), high LR causes instability (less bad)
    lr_ratio = np.clip(lr / lr_opt, 1e-6, 1e6)
    log_lr_ratio = np.log(lr_ratio)
    
    # Steeper penalty for low LR (power 1.9), gentler for high LR (power 1.45)
    penalty_low = E_low * np.maximum(0, -log_lr_ratio) ** 1.9
    penalty_high = E_high * np.maximum(0, log_lr_ratio) ** 1.45
    lr_penalty = 1.0 + penalty_low + penalty_high
    
    # Batch size effect: smooth saturation via sigmoid-like function
    # Small batches: high gradient noise (penalty)
    # Large batches: diminishing returns (approaches asymptote)
    bsz_ratio = bsz / bsz_crit
    bsz_penalty = 1.0 + F / (1.0 + bsz_ratio ** eta)
    
    # Multiplicative combination for better cross-scale behavior
    loss = base_loss * lr_penalty * bsz_penalty
    
    return loss


def fit_scaling_law(data_points, loss_values):
    """
    Robust three-stage optimization with intelligent initialization
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    lr = X[:, 0]
    bsz = X[:, 1]
    
    loss_min, loss_max = np.min(y), np.max(y)
    loss_std = np.std(y)
    
    # Smart initialization: find optimal LR from low-loss region
    low_loss_threshold = np.percentile(y, 18)
    low_loss_mask = y <= low_loss_threshold
    if np.sum(low_loss_mask) > 10:
        lr_opt_init = np.median(lr[low_loss_mask])
        bsz_crit_init = np.median(bsz[low_loss_mask])
    else:
        # Fallback: weighted by inverse loss
        weights = np.exp(-2.0 * (y - loss_min) / (loss_std + 1e-8))
        lr_opt_init = np.average(lr, weights=weights)
        bsz_crit_init = np.median(bsz)
    
    # Bounds: [A, α, B, β, C, E_low, E_high, log_lr_opt, F, log_bsz_crit, η]
    bounds = [
        (1e-4, 82.0),           # A
        (0.19, 0.51),           # α (near Chinchilla 0.34)
        (1e-4, 82.0),           # B
        (0.19, 0.51),           # β (near Chinchilla 0.28)
        (loss_min * 0.43, loss_max * 0.89),  # C
        (0.0, 6.2),             # E_low (typically larger)
        (0.0, 4.2),             # E_high (typically smaller)
        (np.log(np.min(lr) * 0.23), np.log(np.max(lr) * 4.3)),  # log_lr_opt
        (0.0, 3.1),             # F
        (np.log(18), np.log(720)),  # log_bsz_crit
        (0.43, 1.72),           # η
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            
            if not np.all(np.isfinite(pred)):
                return 1e10
            
            residuals = pred - y
            mse = np.mean(residuals ** 2)
            mae = np.mean(np.abs(residuals))
            
            # Regularization toward theoretical values and simplicity
            reg_alpha = 0.0078 * (params[1] - 0.34) ** 2
            reg_beta = 0.0078 * (params[3] - 0.28) ** 2
            
            # Encourage asymmetry: E_low should be >= E_high
            reg_asymmetry = 0.0015 * max(0, params[6] - params[5] - 0.3) ** 2
            
            # Prefer moderate effect strengths
            reg_effects = 0.0011 * (params[5] ** 2 + params[6] ** 2 + params[8] ** 2)
            
            reg = reg_alpha + reg_beta + reg_asymmetry + reg_effects
            
            # Balanced objective with slight MSE emphasis
            return 0.64 * mse + 0.36 * mae + reg
        except:
            return 1e10
    
    # Stage 1: Global search with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        strategy='best1bin',
        maxiter=470,
        popsize=23,
        atol=1e-9,
        tol=1e-9,
        seed=42,
        workers=1,
        polish=False,
        updating='deferred',
        init='sobol',
        recombination=0.7,
        mutation=(0.55, 1.45)
    )
    
    best_params = result_de.x
    best_score = result_de.fun
    
    # Stage 2: Local refinement with smart restarts
    for attempt in range(5):
        if attempt == 0:
            init_params = best_params
        else:
            # Adaptive perturbation: vary effect parameters more than exponents
            perturbation = np.random.randn(len(best_params)) * 0.024
            perturbation[1] *= 0.57  # α: keep stable
            perturbation[3] *= 0.57  # β: keep stable
            perturbation[5] *= 2.5   # E_low: explore more
            perturbation[6] *= 2.5   # E_high: explore more
            perturbation[8] *= 2.15  # F: explore more
            init_params = best_params + perturbation
        
        init_params = np.clip(init_params, [b[0] for b in bounds], [b[1] for b in bounds])
        
        result_local = minimize(
            objective,
            init_params,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 3650, 'ftol': 1e-13, 'gtol': 1e-10, 'maxls': 48}
        )
        
        if result_local.success and result_local.fun < best_score:
            best_params = result_local.x
            best_score = result_local.fun
    
    # Stage 3: Final polish with Nelder-Mead (derivative-free)
    result_nm = minimize(
        objective,
        best_params,
        method='Nelder-Mead',
        options={'maxiter': 1900, 'xatol': 1e-11, 'fatol': 1e-12}
    )
    
    if result_nm.success and result_nm.fun < best_score:
        best_params = result_nm.x
    
    return best_params
# EVOLVE-BLOCK-END

#5 Run 5 R² = 0.894674

▼

Python

# EVOLVE-BLOCK-START
"""
Refined Chinchilla scaling law with enhanced numerical stability
L = A + B/N^α + C/D^β + E*log(lr/lr0) + F*log(bsz/bsz0)
Focus: Robust fitting with physical constraints and numerical safety
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    9-parameter Chinchilla scaling law with robust numerical handling
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if params.ndim == 1:
        params = params[None, :]
    
    lr, bsz, D, N = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    
    # Unpack parameters
    A, B, alpha, C, beta, E, lr0, F, bsz0 = [params[:, i] for i in range(9)]
    
    eps = 1e-10
    
    # Core power law terms with safe operations
    N_safe = np.maximum(N[None, :], eps)
    param_term = B[:, None] / (N_safe ** alpha[:, None])
    
    D_safe = np.maximum(D[None, :], eps)
    data_term = C[:, None] / (D_safe ** beta[:, None])
    
    # Log-ratio terms with conservative clipping
    lr_safe = np.maximum(lr[None, :], eps)
    lr0_safe = np.maximum(lr0[:, None], eps)
    lr_ratio = np.clip(lr_safe / lr0_safe, 1e-2, 1e2)
    lr_term = E[:, None] * np.log(lr_ratio)
    
    bsz_safe = np.maximum(bsz[None, :], 1.0)
    bsz0_safe = np.maximum(bsz0[:, None], 1.0)
    bsz_ratio = np.clip(bsz_safe / bsz0_safe, 1e-2, 1e2)
    bsz_term = F[:, None] * np.log(bsz_ratio)
    
    pred = A[:, None] + param_term + data_term + lr_term + bsz_term
    
    return pred[0, :] if pred.shape[0] == 1 else pred


def fit_scaling_law(data_points, loss_values):
    """
    Multi-stage optimization with robust initialization
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    if y.ndim == 1:
        y = y[:, None]
    
    lr, bsz, D, N = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    
    # Loss statistics
    loss_min, loss_max = np.min(y), np.max(y)
    loss_range = loss_max - loss_min
    
    # Find good hyperparameters from top performers
    top_20_mask = y[:, 0] < np.percentile(y, 20)
    lr0_init = np.median(lr[top_20_mask]) if np.any(top_20_mask) else np.median(lr)
    bsz0_init = np.median(bsz[top_20_mask]) if np.any(top_20_mask) else np.median(bsz)
    
    # Physics-based initialization
    N_mean = np.exp(np.mean(np.log(N)))
    D_mean = np.exp(np.mean(np.log(D)))
    
    # Chinchilla-inspired exponents
    alpha_init = 0.40
    beta_init = 0.28
    
    # Initialize coefficients
    A_init = loss_min * 0.90
    B_init = loss_range * 0.3 * (N_mean ** alpha_init)
    C_init = loss_range * 0.3 * (D_mean ** beta_init)
    E_init = 0.05
    F_init = 0.02
    
    init_params = np.array([
        A_init, B_init, alpha_init, C_init, beta_init,
        E_init, lr0_init, F_init, bsz0_init
    ])
    
    # Conservative bounds for stability
    bounds = [
        (loss_min * 0.7, loss_min * 1.3),      # A
        (1e-4, loss_range * 150),               # B
        (0.25, 0.60),                           # α
        (1e-4, loss_range * 150),               # C
        (0.18, 0.38),                           # β
        (-0.35, 0.35),                          # E
        (np.min(lr) * 0.3, np.max(lr) * 3.5),  # lr0
        (-0.25, 0.25),                          # F
        (np.min(bsz) * 0.3, np.max(bsz) * 3.5) # bsz0
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y[:, 0]
            mse = np.mean(residuals ** 2)
            
            # Light regularization
            reg = 1e-8 * np.sum(params ** 2)
            
            # Preference for literature values
            reg += 5e-6 * ((params[2] - 0.40) ** 2 + (params[4] - 0.28) ** 2)
            
            return mse + reg
        except (ValueError, RuntimeWarning, FloatingPointError):
            return 1e12
    
    best_params = init_params.copy()
    best_loss = objective(init_params)
    
    # Stage 1: Differential evolution for global search
    try:
        result_de = differential_evolution(
            objective, bounds,
            seed=42,
            maxiter=350,
            popsize=16,
            atol=1e-6,
            tol=1e-5,
            workers=1,
            updating='deferred',
            polish=True
        )
        if result_de.fun < best_loss:
            best_params = result_de.x
            best_loss = result_de.fun
    except Exception:
        pass
    
    # Stage 2: Local refinement from initialization
    try:
        result_local1 = minimize(
            objective,
            init_params,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 1500,
                'ftol': 1e-10,
                'gtol': 1e-8
            }
        )
        if result_local1.success and result_local1.fun < best_loss:
            best_params = result_local1.x
            best_loss = result_local1.fun
    except Exception:
        pass
    
    # Stage 3: Polish from best found
    try:
        result_final = minimize(
            objective,
            best_params,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 2000,
                'ftol': 1e-11,
                'gtol': 1e-9,
                'maxcor': 20
            }
        )
        if result_final.success and result_final.fun < best_loss:
            best_params = result_final.x
    except Exception:
        pass
    
    return best_params

# EVOLVE-BLOCK-END