# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM training scenarios under data-constrained conditions.
This evolved version refines the scaling law function to use log-transformed power terms
for improved numerical stability and updates the optimization algorithm with an
informed initial guess for coefficients and robust bounds, drawing inspiration from
high-performing models in the evolution history.
"""
import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
"""
Predicts loss based on unique tokens, model parameters, and total tokens.
The scaling law used is of the form:
Loss = L0 + CU * (unique_tokens)^(-alphaU) + CP * (params)^(-alphaP) + CT * (tokens)^(-alphaT)
Parameters:
- data_points: (N,3) array with columns [unique_tokens, params, tokens].
These are typically large positive numbers.
- params: Array of 7 parameters: [L0, CU, alphaU, CP, alphaP, CT, alphaT].
- L0: The irreducible loss floor. Should be positive.
- CU, CP, CT: Positive coefficients for the power law terms.
- alphaU, alphaP, alphaT: Positive exponents for the inverse power law terms.
Returns:
- Predicted loss values (N,) array.
"""
X = np.atleast_2d(np.asarray(data_points))
# Unpack parameters according to the defined structure
# params: [L0, CU, alphaU, CP, alphaP, CT, alphaT]
L0, CU, alphaU, CP, alphaP, CT, alphaT = params
unique_tokens = X[:, 0]
model_params = X[:, 1]
tokens = X[:, 2]
    # Add a small epsilon to each base so np.log stays finite even if a value is
    # zero; the expected data ranges are strictly positive, so this is a safeguard.
    epsilon = 1e-10
    # Compute the inverse power-law terms via an exp/log transform for numerical
    # stability. The exponents are negated so that positive alphaU, alphaP, alphaT
    # yield terms that shrink as unique_tokens, model_params, or tokens grow.
term_U = CU * np.exp(-alphaU * np.log(unique_tokens + epsilon))
term_P = CP * np.exp(-alphaP * np.log(model_params + epsilon))
term_T = CT * np.exp(-alphaT * np.log(tokens + epsilon))
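    # Each term above is mathematically equal to C * (x + epsilon) ** (-alpha);
    # the exp/log form expresses the same power law while keeping intermediate
    # values well behaved for very large token and parameter counts.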
# Sum up the terms to get the predicted loss
pred_loss = L0 + term_U + term_P + term_T
    # Predictions are returned unclipped; the objective in fit_scaling_law
    # penalizes severely negative predictions instead.
    return pred_loss


def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law function to the provided data using bounded optimization.
Parameters:
    - data_points: (N, 3) array with columns [unique_tokens, model_params, tokens].
- loss_values: (N,) array of corresponding loss values.
Returns:
- Optimized parameters: [L0, CU, alphaU, CP, alphaP, CT, alphaT].
"""
X = np.asarray(data_points)
y = np.asarray(loss_values)
# Define the objective function (Mean Squared Error) for optimization
def objective(params):
predicted_loss = scaling_law_func(X, params)
# Calculate Mean Squared Error
mse = np.mean((predicted_loss - y) ** 2)
        # Penalize non-finite MSE and grossly negative predictions with a large
        # constant. The -100 threshold still lets the optimizer explore mildly
        # negative predictions, which can help it escape local minima.
        if not np.isfinite(mse) or np.any(predicted_loss < -100):
            return 1e12
return mse
# --- Improved Initial Guess for Parameters ---
# Parameters: [L0, CU, alphaU, CP, alphaP, CT, alphaT]
# L0: Irreducible loss floor. Typically positive and below the minimum observed loss.
min_loss_y = np.min(y)
initial_L0 = max(0.01, min_loss_y * 0.8) # Based on high-performing program's heuristic
    # CU, CP, CT: Coefficients. These can be large because x^(-alpha) is very small
    # for large x, so a large starting value gives the optimizer room to explore.
initial_C = 1000.0 # Adopted from high-performing program
# alphaU, alphaP, alphaT: Exponents. Typically small positive values (e.g., 0.05 to 0.2).
initial_alpha = 0.1
initial_params = np.array([
initial_L0,
initial_C, initial_alpha, # CU, alphaU
initial_C, initial_alpha, # CP, alphaP
initial_C, initial_alpha # CT, alphaT
])
# --- Define Bounds for Parameters ---
# Bounds help guide the optimizer to physically meaningful regions and improve stability.
# Parameters: [L0, CU, alphaU, CP, alphaP, CT, alphaT]
bounds = [
(0.001, np.max(y) + 1.0), # L0: Must be positive, up to slightly above max observed loss
(1e-9, None), # CU: Positive, unbounded above to allow for large coefficients
(1e-9, 2.0), # alphaU: Positive, typically < 1, allow up to 2.0
(1e-9, None), # CP: Positive, unbounded above
(1e-9, 2.0), # alphaP: Positive, typically < 1, allow up to 2.0
(1e-9, None), # CT: Positive, unbounded above
(1e-9, 2.0) # alphaT: Positive, typically < 1, allow up to 2.0
]
    # The 1e-9 lower bounds keep the coefficients and exponents strictly positive,
    # avoiding degenerate (zero or negative) power-law terms during optimization.
# --- Optimization using L-BFGS-B ---
    # L-BFGS-B is chosen because it handles box bounds directly, which is needed
    # to enforce the physical constraints on the scaling law parameters.
    # The options (maxiter, ftol) are set for robust convergence.
result = minimize(objective, initial_params, method='L-BFGS-B', bounds=bounds,
options={'disp': False, 'maxiter': 2000, 'ftol': 1e-9})
# Return optimized parameters if successful, otherwise the initial guess as a fallback.
params_opt = result.x if result.success else initial_params
# Final clip to bounds as a safeguard, in case optimizer returns values slightly outside bounds
# (e.g., due to floating point precision or stopping criteria).
params_opt = np.clip(params_opt,
[b[0] if b[0] is not None else -np.inf for b in bounds],
[b[1] if b[1] is not None else np.inf for b in bounds])
return params_opt
# EVOLVE-BLOCK-END
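

# --- Minimal usage sketch (illustrative, not part of the evolved block) ---
# Fits the scaling law to synthetic data generated from hypothetical "true"
# parameters and reports the recovered fit. The parameter values, data ranges,
# and noise level below are assumptions chosen for demonstration only.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    true_params = np.array([1.7, 12.0, 0.12, 8.0, 0.10, 10.0, 0.11])
    n = 64
    demo_points = np.column_stack([
        10 ** rng.uniform(8, 11, n),   # unique_tokens:    1e8 .. 1e11
        10 ** rng.uniform(7, 10, n),   # model parameters: 1e7 .. 1e10
        10 ** rng.uniform(8, 12, n),   # total tokens:     1e8 .. 1e12
    ])
    # Generate losses from the "true" law plus a little observation noise.
    demo_loss = scaling_law_func(demo_points, true_params)
    demo_loss = demo_loss + rng.normal(0.0, 0.01, n)
    fitted = fit_scaling_law(demo_points, demo_loss)
    pred = scaling_law_func(demo_points, fitted)
    print("fitted params:", np.round(fitted, 4))
    print("MSE on synthetic data:", float(np.mean((pred - demo_loss) ** 2)))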