
MoE Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.958324
Mean R²: 0.957962
Min R²: 0.957564
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.958324
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import minimize, Bounds

def scaling_law_func(data_points, params):
    """
    Predicts validation loss based on MoE architecture parameters using an evolved scaling law.
    The law is of the form:
    Loss = (A * num_experts^alpha_e1 + B * num_experts^alpha_e2) * (scaled_dense_param_count^beta_p) + C0

    This model proposes that the overall power-law scaling with dense parameters (exponent
    beta_p) is modulated by a coefficient that is itself the sum of two power laws of the
    number of experts. The intent is to capture interactions between expert count and
    parameter count, i.e. the benefit of additional dense parameters may scale differently
    depending on the number of experts. The model has 6 free parameters, and power terms are
    evaluated via a log-exp transformation for numerical stability.

    Args:
        data_points (np.ndarray): (N,2) array with columns [num_experts, dense_parameter_count].
        params (np.ndarray): Array of 6 parameters:
                             [A, alpha_e1, B, alpha_e2, beta_p, C0].

    Returns:
        np.ndarray: Predicted validation loss values (N,).
    """
    X = np.atleast_2d(np.asarray(data_points))
    num_experts = X[:, 0]
    dense_parameter_count = X[:, 1]

    # Rescale dense_parameter_count to a more manageable range (roughly 1 to 8 for this dataset).
    # This improves numerical stability of the power calculations and makes the exponents more interpretable.
    # P_norm is a fixed constant, chosen as an approximate lower bound of the parameter counts in the dataset.
    P_norm = 1e8 
    scaled_dense_param_count = dense_parameter_count / P_norm

    # Unpack parameters
    A = params[0]
    alpha_e1 = params[1] # Exponent for num_experts in the first expert-dependent coefficient term
    B = params[2]
    alpha_e2 = params[3] # Exponent for num_experts in the second expert-dependent coefficient term
    beta_p = params[4]   # Exponent for scaled_dense_param_count (common to both expert terms)
    C0 = params[5]       # Irreducible loss

    # Use log-exp transformation for numerical stability when dealing with power laws.
    # Add a small epsilon (1e-9) to bases before taking logarithm to prevent log(0) errors,
    # ensuring robustness even if num_experts or scaled_dense_param_count could theoretically be zero.
    log_num_experts = np.log(num_experts + 1e-9)
    log_scaled_dense_param_count = np.log(scaled_dense_param_count + 1e-9)

    # First expert-dependent coefficient part: A * num_experts^alpha_e1
    term_expert_coeff1 = A * np.exp(alpha_e1 * log_num_experts)
    
    # Second expert-dependent coefficient part: B * num_experts^alpha_e2
    term_expert_coeff2 = B * np.exp(alpha_e2 * log_num_experts)

    # Combine expert-dependent coefficients. This sum forms the overall coefficient
    # for the parameter scaling term.
    combined_expert_coeff = term_expert_coeff1 + term_expert_coeff2
    
    # Apply the common parameter scaling: scaled_dense_param_count^beta_p
    param_scaling_term = np.exp(beta_p * log_scaled_dense_param_count)

    # Total predicted loss including the irreducible loss C0
    pred = combined_expert_coeff * param_scaling_term + C0

    return pred

def fit_scaling_law(data_points, loss_values):
    """
    Fits the evolved 6-parameter scaling law to the provided data using bounded optimization.
    It uses 'L-BFGS-B', a bounded quasi-Newton method, with informed initial guesses and
    parameter bounds that guide the optimization towards physically realistic and accurate
    solutions for this model structure.

    Args:
        data_points (np.ndarray): (N,2) array with columns [num_experts, dense_parameter_count].
        loss_values (np.ndarray): Array of corresponding validation loss values (N,).

    Returns:
        np.ndarray: Optimized parameters [A, alpha_e1, B, alpha_e2, beta_p, C0].
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)

    min_loss = np.min(y)

    # Informed initial guesses for the 6 parameters: [A, alpha_e1, B, alpha_e2, beta_p, C0].
    # These reflect common scaling-law patterns, adapted to this model structure.
    # Exponents (alpha_e1, alpha_e2, beta_p) are typically negative as increasing resources reduces loss.
    initial_params = np.array([
        5.0,                 # A: Coefficient for the first expert term, generally positive
        -0.05,               # alpha_e1: Exponent for num_experts in the first term (expected negative)
        1.0,                 # B: Coefficient for the second expert term, generally positive
        -0.01,               # alpha_e2: Exponent for num_experts in the second term (expected negative, potentially a smaller effect)
        -0.1,                # beta_p: Exponent for dense_parameter_count (typically negative)
        min_loss * 0.9       # C0: Irreducible loss, initialized slightly below min observed loss
    ])

    # Define bounds for each parameter to ensure physical realism and aid optimizer convergence.
    # The bounds are set to allow sufficient exploration while preventing unphysical solutions.
    # Specific bounds interpretation:
    # A, B: Must be positive (or very close to zero). Upper bounds are generous.
    # alpha_e1, alpha_e2: Exponents for num_experts. Typically negative (more experts reduce loss),
    #                     but allowing up to 0.1 to account for plateauing or minor overheads.
    # beta_p: Exponent for dense_parameter_count. Must be negative or zero (loss decreases or plateaus with more params).
    # C0: Irreducible loss, must be non-negative and less than or equal to the maximum observed loss.
    bounds = Bounds(
        [0.001, -5.0, 0.001, -5.0, -5.0, 0.0],  # Lower bounds
        [1000.0, 0.1,  1000.0, 0.1,  0.0, np.max(y)] # Upper bounds
    )

    def objective(params):
        """Objective function for minimization (Mean Squared Error)."""
        pred = scaling_law_func(X, params)
        # Robustness check: Penalize heavily if predictions are non-finite (NaN, Inf) or unphysical (negative loss).
        # This guides the optimizer away from problematic parameter regions that yield invalid predictions.
        if not np.all(np.isfinite(pred)) or np.any(pred < 0):
            return np.inf 
        mse = np.mean((pred - y) ** 2)
        return mse

    # Perform optimization using 'L-BFGS-B', which is suitable for bounded, non-linear problems.
    # This method has proven effective in previous top-performing programs for this task.
    result = minimize(objective, initial_params, method='L-BFGS-B', bounds=bounds)

    if result.success:
        # Optimization converged successfully, return the found parameters.
        return result.x
    else:
        # If optimization failed (e.g., did not converge within max iterations or reached a local minimum
        # that doesn't satisfy convergence criteria), print a warning and return the initial parameters
        # as a fallback. This indicates potential issues with the model, data, or optimization settings.
        print(f"Optimization failed: {result.message}. Returning initial parameters.")
        return initial_params
# EVOLVE-BLOCK-END
#2 Run 5 R² = 0.958143
#3 Run 3 R² = 0.957990
#4 Run 2 R² = 0.957790
#5 Run 4 R² = 0.957564
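
For context, here is a minimal usage sketch of the best run's functions, assuming synthetic data drawn from the same functional form. The actual benchmark dataset and evaluation harness are not shown on this page, so every value below (expert counts, parameter counts, the "true" parameter vector, and the noise level) is illustrative only.

Python
import numpy as np

# Assumes scaling_law_func and fit_scaling_law from the run above are already defined.
# Synthetic data only: expert counts and dense parameter counts are made-up values.
rng = np.random.default_rng(0)
num_experts = rng.choice([1.0, 2.0, 4.0, 8.0, 16.0, 32.0], size=200)
dense_params = rng.uniform(1e8, 8e8, size=200)
X = np.column_stack([num_experts, dense_params])

# Generate losses from an assumed ground-truth parameter vector plus small noise.
true_params = np.array([4.0, -0.04, 1.5, -0.02, -0.12, 1.8])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, size=200)

# Fit the 6-parameter law and score it with the coefficient of determination (R^2),
# the same metric reported in the leaderboard entries above.
fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print("fitted params:", fitted)
print("R^2:", r2)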