# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Evolved program with a more robust, well-validated scaling law for MoE architectures,
drawing on the top-performing models from previous attempts.
Key improvements over the immediately preceding version:
1. **Refined Scaling Law Function**: The core scaling law function `scaling_law_func`
retains the multiplicative power-law form:
L = p0 * (P_dense_norm^p1) * (N_experts^p2) + p3 * (N_experts^p4) + p5
This structure separates the contributions of dense parameters and experts into a
primary interactive term and a secondary expert-specific term, plus an irreducible loss.
This form has demonstrated strong performance in modeling MoE architectures.
2. **Input Normalization**: `dense_parameter_count` is normalized by `1e8` to improve
numerical stability during optimization and make parameter interpretations more intuitive.
This normalization is a fixed constant, not dependent on the input data batch.
3. **Refined Initial Guesses**: Initial parameter values are carefully chosen to reflect
expected ranges after normalization and typical scaling law behaviors, aiding convergence.
These are based on the successful heuristic from previous top-performing programs.
4. **Optimized Bounds**: Parameter bounds are designed to be restrictive yet realistic,
guiding the optimizer towards physically plausible solutions. Small adjustments have been
made to `p0` and `p3` bounds to potentially explore a slightly wider optimal range,
while maintaining overall stability.
5. **Enhanced Objective Function**: The objective function includes robust checks for `NaN`,
`Inf`, and negative loss predictions, penalizing them heavily to steer the optimizer
away from unstable or unrealistic parameter spaces, ensuring numerical stability.
6. **Increased Max Iterations**: The `maxiter` for the L-BFGS-B optimizer has been further
increased to `2000` (from `1000`) to allow more steps for convergence, especially with
complex landscapes or tight bounds, potentially leading to a more precise minimum.
"""
import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
    # data_points: (N, 2) array with columns [num_experts, dense_parameter_count]
    # params: array of 6 parameters: [p0, p1, p2, p3, p4, p5]
    # Model form: L = p0 * (P_dense_norm^p1) * (N_experts^p2) + p3 * (N_experts^p4) + p5
    num_experts = data_points[:, 0]
    dense_parameter_count = data_points[:, 1]
    # Normalize dense_parameter_count by a fixed reference scale of 1e8 (the smallest dense
    # parameter count in the targeted regime). This maps the large raw counts (1e8 to 8e8)
    # into a smaller, more numerically stable range (1 to 8), which helps the optimizer and
    # makes p0 more interpretable as a base loss contribution. The constant is fixed and is
    # not derived from the current batch of data_points.
    dense_parameter_count_norm = dense_parameter_count / 1e8
    p = np.asarray(params).flatten()
    if len(p) != 6:
        raise ValueError(f"params must contain exactly 6 elements, but got {len(p)}")
    # Calculate the predicted loss using the evolved scaling law function.
    # np.power handles fractional and negative exponents robustly.
    # num_experts ranges from 1 to 64 and dense_parameter_count_norm from 1 to 8 (both have a
    # minimum of 1), so there are no issues with 0^negative_exponent or extremely large bases.
    # Term 1: Joint scaling of normalized dense parameters and experts.
    # This term captures the primary scaling effect where model size and expert count interact.
    term1 = p[0] * np.power(dense_parameter_count_norm, p[1]) * np.power(num_experts, p[2])
    # Term 2: Expert-specific scaling.
    # This term captures additional effects purely related to the number of experts,
    # such as routing overheads, expert capacity benefits, or other MoE-specific dynamics.
    term2 = p[3] * np.power(num_experts, p[4])
    # Term 3: Irreducible loss.
    # Represents the asymptotic minimum loss that cannot be reduced by increasing model size or experts.
    predicted_loss = term1 + term2 + p[5]
    return predicted_loss
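
# Worked example of the functional form (illustrative, hand-picked parameter values, not
# fitted results): with 8 experts and 4e8 dense parameters (P_dense_norm = 4) and
# params = [2.0, -0.2, -0.05, 0.5, -0.1, 1.5],
#   L = 2.0 * 4**(-0.2) * 8**(-0.05) + 0.5 * 8**(-0.1) + 1.5 ≈ 1.366 + 0.406 + 1.5 ≈ 3.27
# so scaling_law_func(np.array([[8.0, 4e8]]), [2.0, -0.2, -0.05, 0.5, -0.1, 1.5]) ≈ [3.27].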


def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).flatten()
    # Initial guess for the 6 parameters: [p0, p1, p2, p3, p4, p5]
    # p0: Coefficient for the primary scaling term (P_dense_norm^p1 * N_experts^p2).
    # p1: Exponent for dense_parameter_count_norm (expected negative, as more parameters reduce loss).
    # p2: Exponent for num_experts in the first term (can be positive or negative).
    # p3: Coefficient for the secondary num_experts term.
    # p4: Exponent for num_experts in the second term (can be positive or negative).
    # p5: Irreducible loss (asymptotic minimum).
    # Heuristic initial guess based on typical scaling-law values and the characteristics of the data.
    # p0 is scaled for the normalized P_dense, placing it close to the observed loss values.
    # p1 is typically negative, indicating diminishing returns with more parameters.
    # p2 and p4 often show slight negative exponents, suggesting experts generally help reduce loss.
    initial_p5 = np.min(y) * 0.9  # Starting point for the irreducible loss, slightly below the observed minimum.
    initial_params = np.array([
        2.0,         # p0: Coefficient for the primary scaling term (close to observed losses after normalization)
        -0.15,       # p1: Exponent for dense_parameter_count_norm (typical scaling exponent)
        -0.05,       # p2: Exponent for num_experts in the first term (experts often have a subtle scaling effect)
        0.5,         # p3: Coefficient for the secondary num_experts term (modest expected effect)
        -0.05,       # p4: Exponent for num_experts in the second term
        initial_p5,  # p5: Irreducible loss
    ])
    # Bounds for the 6 parameters to guide the optimizer and ensure physical realism.
    # Tighter bounds constrain the search space to plausible values, improving convergence speed
    # and preventing physically unrealistic solutions. Minor adjustments to the p0 and p3 bounds
    # allow a slightly wider search.
    bounds = [
        (0.001, 20.0),    # p0: must be positive; wider upper bound for flexibility.
        (-1.0, -0.01),    # p1: must be negative (more parameters -> less loss); realistic exponent range.
        (-1.0, 1.0),      # p2: can be positive or negative; bounded to prevent extreme expert scaling.
        (0.0, 10.0),      # p3: must be non-negative; wider upper bound.
        (-1.0, 1.0),      # p4: can be positive or negative; same bounded range.
        (0.0, np.max(y)), # p5: irreducible loss must be non-negative and below the max observed loss.
    ]

    def objective(params):
        """Calculates the mean squared error for the given parameters, with penalties for invalid predictions."""
        pred = scaling_law_func(X, params)
        # Handle potential numerical instabilities (e.g., NaNs or Infs) or physically impossible
        # predictions (e.g., negative loss). A very large error is returned to heavily penalize
        # such parameter combinations and guide the optimizer away.
        if np.any(np.isnan(pred)) or np.any(np.isinf(pred)) or np.any(pred < 0):
            return 1e10  # Very large error to steer the optimizer away from these regions
        mse = np.mean((pred - y) ** 2)
        return mse

    # Use L-BFGS-B for bounded optimization: it is more robust than BFGS for constrained problems
    # and supports parameter bounds. maxiter is increased to give the optimizer more steps to find
    # a better minimum, especially with tight bounds.
    result = minimize(objective, initial_params, method='L-BFGS-B', bounds=bounds,
                      options={'maxiter': 2000})
    # Return the optimized parameters if the optimization succeeded; otherwise fall back to the
    # initial guess so the function always returns a valid parameter vector.
    params_opt = result.x if result.success else initial_params
    return params_opt
# EVOLVE-BLOCK-END
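

# A minimal usage sketch (outside the evolve block), assuming a synthetic grid of
# (num_experts, dense_parameter_count) points in the 1-64 expert / 1e8-8e8 parameter regime
# described above. The "true" parameters and noise level are illustrative assumptions used
# only to generate synthetic losses; they are not results from any real experiment.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    experts = np.array([1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0])
    dense = np.array([1e8, 2e8, 4e8, 8e8])
    grid = np.array([(e, d) for e in experts for d in dense])  # shape (28, 2)

    # Hypothetical ground-truth parameters for generating synthetic losses.
    true_params = np.array([2.0, -0.2, -0.05, 0.5, -0.1, 1.5])
    losses = scaling_law_func(grid, true_params) + rng.normal(scale=0.01, size=grid.shape[0])

    fitted = fit_scaling_law(grid, losses)
    pred = scaling_law_func(grid, fitted)
    rmse = float(np.sqrt(np.mean((pred - losses) ** 2)))
    print("fitted params:", np.round(fitted, 4))
    print("synthetic-grid RMSE:", round(rmse, 5))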