MoE Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.963224
Mean R²: 0.958691
Min R²: 0.954018
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.963224
Python
# EVOLVE-BLOCK-START
"""
Enhanced MoE scaling law with multiplicative expert-parameter coupling
Key innovations:
1. Quadratic log expert modulation in the denominator (captures non-linear saturation in E)
2. Smooth tanh-based expert effect with optimized constraints
3. Refined parameter bounds for faster convergence
4. Enhanced multi-start optimization strategy
"""
import numpy as np
from scipy.optimize import differential_evolution, minimize

def scaling_law_func(data_points, params):
    """
    Scaling law: L = a * N^(-b) / (1 + c * log(E)^2) + d * tanh(e * E) + f
    
    Where:
    - N: dense_parameter_count
    - E: num_experts
    - a, b: base parameter scaling (Chinchilla-inspired)
    - c: quadratic log expert interaction (non-linear saturation)
    - d, e: hyperbolic expert effect (smooth bounded saturation)
    - f: irreducible loss floor
    
    6 parameters total
    
    Key insight: the log(E)^2 term in the denominator models non-linear
    expert-parameter trade-offs, while tanh provides smooth, bounded saturation
    for expert routing effects
    """
    X = np.atleast_2d(np.asarray(data_points))
    num_experts = X[:, 0]
    dense_params = X[:, 1]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Extract and constrain parameters for numerical stability
    a = np.abs(params[:, 0]) + 1e-10      # Scale for parameter term (positive)
    b = np.clip(params[:, 1], 0.01, 1.2)  # Parameter exponent (bounded)
    c = np.clip(params[:, 2], -0.5, 0.5)  # Quadratic log coefficient (bounded)
    d = np.abs(params[:, 3])              # Hyperbolic scale (non-negative)
    e = np.clip(params[:, 4], 0.001, 0.2) # Hyperbolic rate (small positive)
    f = params[:, 5]                       # Loss floor (unconstrained)
    
    # Safe values for numerical operations
    E_safe = np.maximum(num_experts, 1.0)
    N_safe = np.maximum(dense_params, 1e6)
    
    # Term 1: Parameter scaling with quadratic log expert modulation
    # Division by (1 + c*log(E)^2) captures non-linear expert effects
    log_E_sq = np.log(E_safe[None, :]) ** 2
    denominator = 1.0 + c[:, None] * log_E_sq
    denominator = np.maximum(denominator, 0.1)  # Prevent division issues
    term1 = a[:, None] * np.power(N_safe[None, :], -b[:, None]) / denominator
    
    # Term 2: Smooth hyperbolic expert effect (bounded saturation)
    # tanh provides smooth, bounded behavior for routing overhead
    term2 = d[:, None] * np.tanh(e[:, None] * E_safe[None, :])
    
    # Combined prediction
    pred = term1 + term2 + f[:, None]
    
    return pred[0, :] if pred.shape[0] == 1 else pred.T


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced three-stage optimization with adaptive strategy:
    1. Global search with differential evolution
    2. Multi-start local refinement with diversity
    3. Final polishing with tight tolerances
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    
    if y.ndim == 1:
        y = y[:, None]
    
    # Data range informs the adaptive bound on the loss floor
    y_min = float(np.min(y))
    
    # Adaptive parameter bounds
    # [a, b, c, d, e, f]
    bounds = [
        (0.1, 300.0),               # a: wide range for parameter scale
        (0.01, 1.0),                # b: standard power law range
        (-0.4, 0.4),                # c: bounded quadratic log coefficient
        (0.0, 20.0),                # d: hyperbolic scale (non-negative)
        (0.001, 0.15),              # e: small hyperbolic rate for smooth saturation
        (y_min - 0.8, y_min + 1.2)  # f: loss floor with adaptive range
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y.ravel()
            mse = np.mean(residuals ** 2)
            
            # Minimal regularization for stability
            reg = 1e-8 * np.sum(params ** 2)
            
            return mse + reg
        except Exception:
            return 1e10
    
    # Stage 1: Global search with enhanced differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        maxiter=550,
        popsize=26,
        seed=42,
        atol=1e-8,
        tol=1e-8,
        strategy='best1bin',
        mutation=(0.6, 1.8),
        recombination=0.75,
        workers=1,
        polish=False
    )
    
    # Stage 2: Multi-start local refinement with strategic diversity
    best_result = result_de
    best_loss = result_de.fun
    
    for seed_offset in [0, 750, 1500, 2500]:
        # Perturb initial guess for exploration
        np.random.seed(42 + seed_offset)
        x0 = result_de.x + np.random.randn(6) * 0.06 * np.abs(result_de.x)
        x0 = np.clip(x0, [b[0] for b in bounds], [b[1] for b in bounds])
        
        result_local = minimize(
            objective,
            x0,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 2200, 'ftol': 1e-11, 'gtol': 1e-10}
        )
        
        if result_local.success and result_local.fun < best_loss:
            best_loss = result_local.fun
            best_result = result_local
    
    # Stage 3: Final polishing with very tight tolerances
    result_final = minimize(
        objective,
        best_result.x,
        method='L-BFGS-B',
        bounds=bounds,
        options={'maxiter': 3500, 'ftol': 1e-12, 'gtol': 1e-11}
    )
    
    if result_final.success and result_final.fun < best_loss:
        return result_final.x
    return best_result.x
# EVOLVE-BLOCK-END
#2 Run 3 R² = 0.961910
#3 Run 1 R² = 0.958625
#4 Run 2 R² = 0.955676
#5 Run 5 R² = 0.954018
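
Usage sketch

A minimal sketch of how the winning run's scaling_law_func and fit_scaling_law might be exercised end to end, assuming a small synthetic grid of (num_experts, dense_parameter_count) configurations and the standard coefficient-of-determination definition of R². The data values and the r_squared helper are illustrative assumptions, not part of the submitted run, and they do not reproduce the leaderboard scores.

Python
# Illustrative only: fit the scaling law defined above on hypothetical data
# and score it with a standard R^2. The losses below are made up for the sketch.
import numpy as np

def r_squared(y_true, y_pred):
    """Standard coefficient of determination: 1 - SS_res / SS_tot."""
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot

# Hypothetical grid: columns are [num_experts, dense_parameter_count]
X = np.array([
    [1.0,  1.3e8], [8.0,  1.3e8], [32.0, 1.3e8],
    [1.0,  1.3e9], [8.0,  1.3e9], [32.0, 1.3e9],
    [1.0,  6.5e9], [8.0,  6.5e9], [32.0, 6.5e9],
])
# Hypothetical validation losses for those configurations
y = np.array([3.60, 3.35, 3.28, 2.95, 2.74, 2.68, 2.55, 2.38, 2.33])

params = fit_scaling_law(X, y)        # three-stage fit defined above
preds = scaling_law_func(X, params)   # predictions on the same grid
print("fitted params:", np.round(params, 4))
print("R^2 on the fit data:", round(float(r_squared(y, preds)), 4))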