# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
This evolved program aims to improve fitness by addressing numerical stability and
optimization robustness issues present in the previous version.
The primary improvements include:
1. **Fixed Feature Normalization:** Applied within `scaling_law_func` using global reference
constants (`P_REF`, `V_REF`, `C_REF`). This is crucial for handling input features
spanning vastly different magnitudes (e.g., 10^7 to 10^12), preventing numerical
instability during power law calculations and making parameter interpretation more consistent.
2. **Multi-Restart Optimization:** The `fit_scaling_law` function now employs an increased
number of optimization restarts (`n_restarts = 40`) with randomized initial guesses. This
strategy is vital for non-convex objectives such as scaling-law fitting, helping the
optimizer escape local minima and find a solution closer to the global optimum.
3. **Refined Parameter Bounds and Initial Guesses:** Bounds for coefficients (A, B, C) and
exponents (alpha_P, alpha_V, alpha_C) are carefully set to ensure physical meaningfulness
and numerical stability for normalized inputs. Initial guesses are dynamically estimated
from observed loss values and randomized to thoroughly explore the parameter space.
4. **Theoretically Grounded Functional Form:** The core scaling law function retains the
additive inverse power law form (Lossu = -( A * (Feature/Ref)^(-alpha) + ... + L_min )),
which is a widely used, parameter-efficient model that makes full use of the maximum
allowed 7 parameters and captures diminishing returns as each resource increases.
"""
import numpy as np
from scipy.optimize import minimize
# Define fixed reference scales for feature normalization.
# These constants are chosen to roughly center the input features around 1 or a manageable range,
# improving numerical stability for the power law calculations.
# - P_non_vocab: Data ranges from 3.3e7 to 1.1e9. P_REF = 1e8 is a good central value.
# - vocab_size: Data ranges from 4096 to 96256. V_REF = 1e4 is a good central value.
# - num_characters: Data ranges from 1e8 to 5e12. C_REF = 1e12 brings the normalized values
#   into a range of roughly 1e-4 to 5, which keeps the power-law terms numerically
#   manageable during optimization.
P_REF = 1e8
V_REF = 1e4
C_REF = 1e12
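# For example (illustrative values within the stated data ranges): P_non_vocab = 3.3e7
# normalizes to 3.3e7 / P_REF = 0.33, vocab_size = 32768 to 32768 / V_REF ~= 3.28, and
# num_characters = 1e10 to 1e10 / C_REF = 0.01.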
def scaling_law_func(data_points, params):
"""
Predicts Lossu values based on a scaling law model with fixed feature normalization.
The model form is:
Lossu = -( A * (P_non_vocab/P_REF)^(-alpha_P) + B * (vocab_size/V_REF)^(-alpha_V) + C * (num_characters/C_REF)^(-alpha_C) + L_min )
Parameters:
data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
Assumed to be positive.
params (np.ndarray): 1D array of 7 parameters:
[A, alpha_P, B, alpha_V, C, alpha_C, L_min]
A, B, C: positive coefficients for each term.
alpha_P, alpha_V, alpha_C: positive exponents for each term.
L_min: positive irreducible loss component (in the positive-loss space).
Returns:
np.ndarray: Predicted Lossu values (negative, where more negative is better).
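    Example (illustrative, hypothetical parameter values, for intuition only):
        A point [1e8, 1e4, 1e12] normalizes to [1, 1, 1], so every power term reduces to
        its coefficient and the prediction is -(2.5 + 2.5 + 0.25 + 0.1) = -5.35:

        >>> scaling_law_func(np.array([[1e8, 1e4, 1e12]]),
        ...                  np.array([2.5, 0.5, 2.5, 0.5, 0.25, 0.5, 0.1]))
        array([-5.35])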
"""
X = np.asarray(data_points)
# Apply fixed normalization using predefined reference scales
# and ensure all normalized inputs are positive to prevent issues with fractional powers.
    X_norm = X.astype(float)  # Work on a float copy so division and fractional powers are safe
X_norm[:, 0] = np.maximum(X_norm[:, 0] / P_REF, 1e-12) # P_non_vocab
X_norm[:, 1] = np.maximum(X_norm[:, 1] / V_REF, 1e-12) # vocab_size
X_norm[:, 2] = np.maximum(X_norm[:, 2] / C_REF, 1e-12) # num_characters
# Unpack parameters for clarity
A, alpha_P, B, alpha_V, C, alpha_C, L_min = params
# Calculate the positive-valued loss components (Y)
# Each term represents a diminishing return as the corresponding factor increases.
predicted_Y = (A * (X_norm[:, 0] ** -alpha_P) +
B * (X_norm[:, 1] ** -alpha_V) +
C * (X_norm[:, 2] ** -alpha_C) +
L_min)
# Lossu measures improvement over unigram (negative = better),
# so we return the negative of the predicted positive-valued loss Y.
return -predicted_Y
def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law function to the provided data using L-BFGS-B optimization.
This version incorporates:
1. Fixed feature normalization (handled by `scaling_law_func` using global constants).
    2. A larger number of random restarts to better explore the parameter space and avoid local minima.
3. Refined initial parameter guesses and randomization ranges tailored for normalized inputs
and the varying impact of different features.
4. Adjusted parameter bounds for exponents to allow for a wider search.
Parameters:
data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
loss_values (np.ndarray): (N,) array of corresponding Lossu values.
Returns:
np.ndarray: Optimized parameters [A, alpha_P, B, alpha_V, C, alpha_C, L_min].
"""
X = np.asarray(data_points)
y_lossu = np.asarray(loss_values)
# Convert Lossu (negative = better) to positive loss (Y) for easier interpretation of L_min
y_positive_loss = -y_lossu
min_observed_positive_loss = np.min(y_positive_loss)
max_observed_positive_loss = np.max(y_positive_loss)
# Define parameter bounds to ensure physical meaningfulness and numerical stability.
# Coefficients (A, B, C) and L_min must be positive.
# Exponents (alpha_P, alpha_V, alpha_C) must be positive and typically not extremely large.
# L_min should be positive and strictly less than the minimum observed positive loss.
bounds = [
(1e-10, None), # A: Coefficient, positive.
(1e-9, 2.0), # alpha_P: Exponent, positive and up to 2.0 (common for scaling laws).
(1e-10, None), # B: Coefficient, positive.
(1e-9, 2.0), # alpha_V: Exponent, positive and up to 2.0.
(1e-10, None), # C: Coefficient, positive.
(1e-9, 2.0), # alpha_C: Exponent, positive and up to 2.0.
(1e-9, min_observed_positive_loss - 1e-6) # L_min: Must be positive and strictly less than min observed loss.
]
# Ensure L_min upper bound is valid; if calculated upper bound is too tight or invalid, set a small default.
if bounds[6][1] <= 1e-9:
bounds[6] = (1e-9, 0.1)
def objective(params):
"""Calculates the Mean Squared Error between predicted and actual Lossu."""
predicted_lossu = scaling_law_func(X, params)
mse = np.mean((predicted_lossu - y_lossu) ** 2)
return mse
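    # Illustrative check of the objective (hypothetical numbers): if the model predicts Lossu of
    # [-5.0, -4.0] while the observed values are [-5.2, -3.9], the MSE is ((0.2)**2 + (0.1)**2) / 2 = 0.025.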
best_params = None
min_mse = np.inf
n_restarts = 40 # Increased number of restarts for more thorough exploration of the parameter space.
# Base initial guess for parameters, adjusted for the feature normalization and their relative impact.
# L_min is initialized as a fraction of the minimum observed positive loss.
l_min_init_guess = min_observed_positive_loss * 0.1 if min_observed_positive_loss * 0.1 > 1e-9 else 0.01
if l_min_init_guess > bounds[6][1]: # Ensure the initial guess for L_min is within its upper bound
l_min_init_guess = bounds[6][1] / 2
if l_min_init_guess < bounds[6][0]: # Ensure the initial guess for L_min is within its lower bound
l_min_init_guess = bounds[6][0] * 2
    # Coefficients are estimated from typical median normalized feature contributions (assuming
    # alpha = 0.5) and by distributing the average loss across the three terms. C starts smaller
    # because the normalized num_characters factor is often larger, so its term needs a smaller coefficient.
base_initial_params = np.array([
2.5, # A: Coefficient for P_non_vocab term (normalized)
0.5, # alpha_P: Exponent
2.5, # B: Coefficient for vocab_size term (normalized)
0.5, # alpha_V: Exponent
0.25, # C: Coefficient for num_characters term (normalized)
0.5, # alpha_C: Exponent
l_min_init_guess # L_min: Irreducible loss component
])
# Randomization ranges for coefficients A, B, C.
# Allowing a wider range for coefficients to capture diverse contributions,
# especially considering the wide range of normalized feature values.
coeff_rand_min_val = 1e-6 # Allows coefficients to be very small if a feature has minimal impact
coeff_rand_max_val = max_observed_positive_loss * 15 # Allows for larger coefficients if a feature dominates or normalized factor is small
for i in range(n_restarts):
# Generate randomized initial guesses for each parameter.
# Exponents are randomized across their full defined bounds.
# L_min is randomized within its valid range.
l_min_rand_upper = min(bounds[6][1], (min_observed_positive_loss + l_min_init_guess) / 2)
rand_params = np.array([
np.random.uniform(max(bounds[0][0], coeff_rand_min_val), coeff_rand_max_val), # A
np.random.uniform(bounds[1][0], bounds[1][1]), # alpha_P (full range)
np.random.uniform(max(bounds[2][0], coeff_rand_min_val), coeff_rand_max_val), # B
np.random.uniform(bounds[3][0], bounds[3][1]), # alpha_V (full range)
np.random.uniform(max(bounds[4][0], coeff_rand_min_val), coeff_rand_max_val), # C
np.random.uniform(bounds[5][0], bounds[5][1]), # alpha_C (full range)
np.random.uniform(bounds[6][0], l_min_rand_upper) # L_min
])
# Perform optimization using L-BFGS-B, which is suitable for bounded problems.
result = minimize(objective, rand_params, method='L-BFGS-B', bounds=bounds)
        if result.success:
            # result.fun is the objective value at result.x, so there is no need to re-evaluate it.
            current_mse = result.fun
            if current_mse < min_mse:
                min_mse = current_mse
                best_params = result.x
if best_params is not None:
return best_params
else:
# Fallback: If all randomized restarts fail, attempt one final optimization with the fixed base initial guess.
print("Warning: All randomized optimization restarts failed. Attempting final optimization with base initial guess.")
result = minimize(objective, base_initial_params, method='L-BFGS-B', bounds=bounds)
if result.success:
return result.x
else:
# If even the base guess fails, return the base initial guess as a last resort.
print(f"Warning: Final optimization with base guess failed: {result.message}. Returning base initial guess.")
return base_initial_params
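
if __name__ == "__main__":
    # Minimal self-test sketch (illustration only, not part of the evolved model): fit the law
    # to synthetic data generated from hypothetical ground-truth parameters and report the MSE.
    rng = np.random.default_rng(0)
    true_params = np.array([2.0, 0.4, 1.5, 0.3, 0.5, 0.2, 0.8])  # hypothetical ground truth
    demo_points = np.column_stack([
        rng.uniform(3.3e7, 1.1e9, size=64),  # P_non_vocab
        rng.uniform(4096, 96256, size=64),   # vocab_size
        rng.uniform(1e8, 5e12, size=64),     # num_characters
    ])
    demo_lossu = scaling_law_func(demo_points, true_params)
    demo_lossu = demo_lossu + rng.normal(scale=0.01, size=demo_lossu.shape)  # small observation noise
    fitted = fit_scaling_law(demo_points, demo_lossu)
    demo_mse = np.mean((scaling_law_func(demo_points, fitted) - demo_lossu) ** 2)
    print("Fitted parameters:", fitted)
    print("Training MSE:", demo_mse)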
# EVOLVE-BLOCK-END