
MoE Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.962352
Mean R²: 0.958229
Min R²: 0.949784
Runs: 5

All Runs (sorted by R²)

#1 Run 2 (Best) R² = 0.962352
Python
# EVOLVE-BLOCK-START
"""
Scaling law for MoE architectures using a refined log-normalized power law.
Improved numerical stability and optimization strategy while maintaining theoretical soundness.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Refined log-normalized power-law: loss = a + b*E_norm^α + c*P_norm^β + d*E_norm^α*P_norm^β
    where E_norm = log(num_experts / 16), P_norm = log(dense_parameter_count / 4e8)
    
    Improvements:
    - Signed powers computed as sign(x) * (|x| + eps)^p, which stays finite for
      negative log-normalized inputs (num_experts < 16, dense_parameter_count < 4e8)
    - Better numerical behavior across the full data range
    - Multiplicative interaction term that reuses the main-effect exponents
    
    params: [a, b, c, d, alpha, beta] (6 parameters)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    
    E = np.maximum(X[:, 0], 1.0)  # num_experts
    P = np.maximum(X[:, 1], 1e7)  # dense_parameter_count
    
    # Log-normalization with data-informed reference points
    E_norm = np.log(E / 16.0)
    P_norm = np.log(P / 4e8)
    
    params = np.asarray(params, dtype=np.float64).flatten()
    a = params[0]
    b = params[1]
    c = params[2]
    d = params[3]
    alpha = params[4]
    beta = params[5]
    
    # Clip exponents for stability
    alpha_safe = np.clip(alpha, -1.5, 1.5)
    beta_safe = np.clip(beta, -1.5, 1.5)
    
    # Signed power: E_norm and P_norm can be negative, so raise the magnitude to
    # the exponent and restore the sign afterwards.
    def safe_signed_power(x, exponent):
        """Compute sign(x) * (|x| + eps)^exponent with a small stabilizing epsilon"""
        abs_x = np.abs(x)
        # Epsilon keeps negative exponents finite when x is near zero
        result = np.sign(x) * np.power(abs_x + 1e-12, exponent)
        return result
    
    E_alpha = safe_signed_power(E_norm, alpha_safe)
    P_beta = safe_signed_power(P_norm, beta_safe)
    
    # Loss model with multiplicative interaction
    loss = a + b * E_alpha + c * P_beta + d * E_alpha * P_beta
    
    return np.squeeze(loss)
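
# Worked example (hypothetical values, not from the benchmark): for
# num_experts = 32 and dense_parameter_count = 8e8, E_norm = log(32/16) ≈ 0.693
# and P_norm = log(8e8/4e8) ≈ 0.693, both positive, so the prediction reduces to
# a + b*0.693**alpha + c*0.693**beta + d*(0.693**alpha)*(0.693**beta)
# (up to the 1e-12 epsilon inside safe_signed_power).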


def fit_scaling_law(data_points, loss_values):
    """
    Multi-stage optimization with adaptive parameters
    Stage 1: Coarse global search with larger population
    Stage 2: Fine-tuned local search with stricter convergence
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    y_min = np.min(y)
    y_max = np.max(y)
    y_mean = np.mean(y)
    y_std = np.std(y)
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            # Check for NaN/Inf
            if not np.all(np.isfinite(pred)):
                return 1e10
            mse = np.mean((pred - y) ** 2)
            return mse if np.isfinite(mse) else 1e10
        except Exception:
            return 1e10
    
    # Adaptive bounds based on loss statistics
    bounds = [
        (y_min - 0.4, y_max + 0.4),   # a: baseline loss with margin
        (-3.0, 3.0),                   # b: expert coefficient (expanded range)
        (-3.0, 3.0),                   # c: parameter coefficient (expanded range)
        (-2.0, 2.0),                   # d: interaction coefficient
        (-1.5, 1.5),                   # alpha: expert exponent
        (-1.5, 1.5),                   # beta: parameter exponent
    ]
    
    # Stage 1: Global search with differential evolution
    # Increased iterations and population for better exploration
    result_de = differential_evolution(
        objective,
        bounds,
        seed=42,
        maxiter=400,
        popsize=20,
        atol=1e-11,
        tol=1e-11,
        workers=1,
        updating='deferred',
        strategy='best1bin',
        recombination=0.8,
        mutation=(0.5, 1.5),
        polish=False
    )
    
    # Stage 2: L-BFGS-B for precise local refinement
    result_local = minimize(
        objective,
        result_de.x,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 300,
            'ftol': 1e-13,
            'gtol': 1e-10,
            'maxcor': 20
        }
    )
    
    # Stage 3: Additional refinement if needed
    if result_local.fun > result_de.fun * 0.95:  # If local didn't improve much
        result_local = minimize(
            objective,
            result_de.x,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 200,
                'ftol': 1e-14,
                'gtol': 1e-11,
                'maxcor': 25
            }
        )
    
    # Return best result
    best_params = result_local.x if result_local.fun < result_de.fun else result_de.x
    
    return best_params

# EVOLVE-BLOCK-END
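
A minimal usage sketch for the evolved block above (not part of the submitted run): it fits the law to a small, made-up grid of (num_experts, dense_parameter_count) points and reports R², computed directly with NumPy. The data values are illustrative assumptions only, and the snippet assumes scaling_law_func and fit_scaling_law are defined in the same file.

Python
import numpy as np

# Hypothetical (num_experts, dense_parameter_count) grid -- not the benchmark data
X = np.array([
    [8,  2e8], [8,  8e8],
    [16, 2e8], [16, 8e8],
    [32, 2e8], [32, 8e8],
], dtype=np.float64)

# Made-up loss values, only to exercise the fitting pipeline
y = np.array([3.10, 2.85, 3.02, 2.78, 2.95, 2.72])

params = fit_scaling_law(X, y)        # DE global search + L-BFGS-B refinement
pred = scaling_law_func(X, params)    # predictions from the fitted law

# Coefficient of determination (R^2) without external dependencies
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print("fitted params:", np.round(params, 4))
print("R^2 on the fit data:", round(float(1.0 - ss_res / ss_tot), 4))
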
#2 Run 5 R² = 0.961899
#3 Run 3 R² = 0.958840
#4 Run 4 R² = 0.958269
#5 Run 1 R² = 0.949784