# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Evolved program with a more robust, well-validated scaling law for MoE architectures,
drawing on the top-performing models from previous attempts.
Key improvements over the immediately preceding version:
1. **Refined Scaling Law Function**: The core scaling law function `scaling_law_func`
retains the multiplicative power-law form:
L = p0 * (P_dense_norm^p1) * (N_experts^p2) + p3 * (N_experts^p4) + p5
This structure separates the contributions of dense parameters and experts into a
primary interactive term and a secondary expert-specific term, plus an irreducible loss.
This form has demonstrated strong performance in modeling MoE architectures.
2. **Input Normalization**: `dense_parameter_count` is normalized by `1e8` to improve
numerical stability during optimization and make parameter interpretations more intuitive.
This normalization is a fixed constant, not dependent on the input data batch.
3. **Refined Initial Guesses**: Initial parameter values are carefully chosen to reflect
expected ranges after normalization and typical scaling law behaviors, aiding convergence.
These are based on the successful heuristic from previous top-performing programs.
4. **Optimized Bounds**: Parameter bounds are designed to be restrictive yet realistic,
guiding the optimizer towards physically plausible solutions. Small adjustments have been
made to `p0` and `p3` bounds to potentially explore a slightly wider optimal range,
while maintaining overall stability.
5. **Enhanced Objective Function**: The objective function includes robust checks for `NaN`,
`Inf`, and negative loss predictions, penalizing them heavily to steer the optimizer
away from unstable or unrealistic parameter spaces, ensuring numerical stability.
6. **Increased Max Iterations**: The `maxiter` for the L-BFGS-B optimizer has been further
increased to `2000` (from `1000`) to allow more steps for convergence, especially with
complex landscapes or tight bounds, potentially leading to a more precise minimum.
"""
import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
    # data_points: (N, 2) array with columns [num_experts, dense_parameter_count]
    # params: array of 6 parameters: [p0, p1, p2, p3, p4, p5]
    # Model form: L = p0 * (P_dense_norm^p1) * (N_experts^p2) + p3 * (N_experts^p4) + p5
    num_experts = data_points[:, 0]
    dense_parameter_count = data_points[:, 1]
    # Normalize dense_parameter_count by a fixed reference scale of 1e8 (the smallest dense
    # parameter count in the targeted regime). This maps the large raw counts (1e8 to 8e8)
    # into a smaller, more numerically stable range (1 to 8), which helps the optimizer and
    # makes p0 more interpretable as a base loss contribution. The constant is fixed and is
    # not derived from the current batch of data_points.
    dense_parameter_count_norm = dense_parameter_count / 1e8
    p = np.asarray(params).flatten()
    if len(p) != 6:
        raise ValueError(f"params must contain exactly 6 elements, but got {len(p)}")
    # Calculate the predicted loss using the evolved scaling law function.
    # np.power handles fractional and negative exponents robustly.
    # num_experts ranges from 1 to 64 and dense_parameter_count_norm from 1 to 8 (both have a
    # minimum of 1), so there are no issues with 0^negative_exponent or extremely large bases.
    # Term 1: Joint scaling of normalized dense parameters and experts.
    # This term captures the primary scaling effect where model size and expert count interact.
    term1 = p[0] * np.power(dense_parameter_count_norm, p[1]) * np.power(num_experts, p[2])
    # Term 2: Expert-specific scaling.
    # This term captures additional effects purely related to the number of experts,
    # such as routing overheads, expert capacity benefits, or other MoE-specific dynamics.
    term2 = p[3] * np.power(num_experts, p[4])
    # Term 3: Irreducible loss.
    # Represents the asymptotic minimum loss that cannot be reduced by increasing model size or experts.
    predicted_loss = term1 + term2 + p[5]
    return predicted_loss
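
# Worked example of the functional form (illustrative, hand-picked parameter values, not
# fitted results): with 8 experts and 4e8 dense parameters (P_dense_norm = 4) and
# params = [2.0, -0.2, -0.05, 0.5, -0.1, 1.5],
#   L = 2.0 * 4**(-0.2) * 8**(-0.05) + 0.5 * 8**(-0.1) + 1.5 ≈ 1.366 + 0.406 + 1.5 ≈ 3.27
# so scaling_law_func(np.array([[8.0, 4e8]]), [2.0, -0.2, -0.05, 0.5, -0.1, 1.5]) ≈ [3.27].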


def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).flatten()
    # Initial guess for the 6 parameters: [p0, p1, p2, p3, p4, p5]
    # p0: Coefficient for the primary scaling term (P_dense_norm^p1 * N_experts^p2).
    # p1: Exponent for dense_parameter_count_norm (expected negative, as more parameters reduce loss).
    # p2: Exponent for num_experts in the first term (can be positive or negative).
    # p3: Coefficient for the secondary num_experts term.
    # p4: Exponent for num_experts in the second term (can be positive or negative).
    # p5: Irreducible loss (asymptotic minimum).
    # Heuristic initial guess based on typical scaling-law values and the characteristics of the data.
    # p0 is scaled for the normalized P_dense, placing it close to the observed loss values.
    # p1 is typically negative, indicating diminishing returns with more parameters.
    # p2 and p4 often show slight negative exponents, suggesting experts generally help reduce loss.
    initial_p5 = np.min(y) * 0.9  # Starting point for the irreducible loss, slightly below the observed minimum.
    initial_params = np.array([
        2.0,         # p0: Coefficient for the primary scaling term (close to observed losses after normalization)
        -0.15,       # p1: Exponent for dense_parameter_count_norm (typical scaling exponent)
        -0.05,       # p2: Exponent for num_experts in the first term (experts often have a subtle scaling effect)
        0.5,         # p3: Coefficient for the secondary num_experts term (modest expected effect)
        -0.05,       # p4: Exponent for num_experts in the second term
        initial_p5,  # p5: Irreducible loss
    ])
    # Bounds for the 6 parameters to guide the optimizer and ensure physical realism.
    # Tighter bounds constrain the search space to plausible values, improving convergence speed
    # and preventing physically unrealistic solutions. Minor adjustments to the p0 and p3 bounds
    # allow a slightly wider search.
    bounds = [
        (0.001, 20.0),    # p0: must be positive; wider upper bound for flexibility.
        (-1.0, -0.01),    # p1: must be negative (more parameters -> less loss); realistic exponent range.
        (-1.0, 1.0),      # p2: can be positive or negative; bounded to prevent extreme expert scaling.
        (0.0, 10.0),      # p3: must be non-negative; wider upper bound.
        (-1.0, 1.0),      # p4: can be positive or negative; same bounded range.
        (0.0, np.max(y)), # p5: irreducible loss must be non-negative and below the max observed loss.
    ]

    def objective(params):
        """Calculates the mean squared error for the given parameters, with penalties for invalid predictions."""
        pred = scaling_law_func(X, params)
        # Handle potential numerical instabilities (e.g., NaNs or Infs) or physically impossible
        # predictions (e.g., negative loss). A very large error is returned to heavily penalize
        # such parameter combinations and guide the optimizer away.
        if np.any(np.isnan(pred)) or np.any(np.isinf(pred)) or np.any(pred < 0):
            return 1e10  # Very large error to steer the optimizer away from these regions
        mse = np.mean((pred - y) ** 2)
        return mse

    # Use L-BFGS-B for bounded optimization: it is more robust than BFGS for constrained problems
    # and supports parameter bounds. maxiter is increased to give the optimizer more steps to find
    # a better minimum, especially with tight bounds.
    result = minimize(objective, initial_params, method='L-BFGS-B', bounds=bounds,
                      options={'maxiter': 2000})
    # Return the optimized parameters if the optimization succeeded; otherwise fall back to the
    # initial guess so the function always returns a valid parameter vector.
    params_opt = result.x if result.success else initial_params
    return params_opt
# EVOLVE-BLOCK-END
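

# A minimal usage sketch (outside the evolve block), assuming a synthetic grid of
# (num_experts, dense_parameter_count) points in the 1-64 expert / 1e8-8e8 parameter regime
# described above. The "true" parameters and noise level are illustrative assumptions used
# only to generate synthetic losses; they are not results from any real experiment.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    experts = np.array([1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0])
    dense = np.array([1e8, 2e8, 4e8, 8e8])
    grid = np.array([(e, d) for e in experts for d in dense])  # shape (28, 2)

    # Hypothetical ground-truth parameters for generating synthetic losses.
    true_params = np.array([2.0, -0.2, -0.05, 0.5, -0.1, 1.5])
    losses = scaling_law_func(grid, true_params) + rng.normal(scale=0.01, size=grid.shape[0])

    fitted = fit_scaling_law(grid, losses)
    pred = scaling_law_func(grid, fitted)
    rmse = float(np.sqrt(np.mean((pred - losses) ** 2)))
    print("fitted params:", np.round(fitted, 4))
    print("synthetic-grid RMSE:", round(rmse, 5))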