# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
This evolved program introduces a new functional form for the scaling law and
enhances the optimization strategy to improve accuracy and robustness.
Key Improvements:
1. **Scaling Law Functional Form:** The `scaling_law_func` now models each output
domain's loss (L_j) based on the sum of power-law contributions from all input
domain proportions (x_k), where the exponent (e_j) is specific to the *output*
domain j, rather than specific to the input domain k.
New form: `L_j = b_j + sum_{k=1 to F} (c_jk * x_k^e_j)`
- `b_j`: Bias for output domain j.
- `c_jk`: Coefficient for influence of input domain k on output domain j.
- `e_j`: Exponent for output domain j, applied to all input proportions affecting L_j.
   With F = 5 domains, this stays within the 35-parameter limit (F biases + F*F coefficients + F exponents = 5 + 25 + 5 = 35).
This structure allows each output domain to exhibit its own scaling behavior with respect
to the mixture proportions, which might better capture cross-domain generalization effects.
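   For concreteness, an illustrative expansion for a single output domain (j = 1, F = 5):
   `L_1 = b_1 + c_11*x_1^e_1 + c_12*x_2^e_1 + c_13*x_3^e_1 + c_14*x_4^e_1 + c_15*x_5^e_1`,
   i.e. every input proportion shares the single exponent e_1 but has its own coefficient c_1k.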
2. **Optimization Robustness:** The `fit_scaling_law` function now employs:
- **Multiple Random Initializations:** In addition to a deterministic initial guess,
     it performs `N_TRIALS` (increased to 30) optimizations from initial parameters drawn
     uniformly at random within their defined bounds. This explores the non-convex parameter
     space more thoroughly and reduces the risk of converging to a poor local minimum.
- **Refined Exponent Initialization Range:** For the random initializations,
     the exponents are initialized within a narrower, more typical range (0.0 to 2.0)
to guide the search towards plausible scaling behaviors, while the broader
L-BFGS-B bounds (0.0 to 5.0) still allow the optimizer to explore further if beneficial.
"""

import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
"""
Predicts multi-domain loss values based on domain proportions using a generalized power law.
The model for each domain's loss L_j is:
L_j = b_j + sum_{k=1 to F} (c_jk * x_k^e_j)
Where:
- F is the number of domains (5).
- x_k is the proportion of domain k.
- b_j is the bias term for output domain j.
- c_jk is the coefficient representing the influence of input proportion x_k on output loss L_j.
- e_j is the exponent for output domain j, shared across all input proportions affecting L_j.
Total parameters: F (biases) + F*F (coefficients) + F (exponents) = 5 + 25 + 5 = 35.
Args:
data_points (np.ndarray): (N, F) array with domain proportions for F domains.
params (np.ndarray): 1D array of 35 parameters.
Returns:
np.ndarray: Predicted multi-domain loss values (N, F).
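    Example (illustrative only; arbitrary, unfitted parameter values):
        >>> F = 5
        >>> params = np.zeros(F + F*F + F)
        >>> params[:F] = 2.0                      # biases b_j = 2.0, all other params 0
        >>> X = np.full((1, F), 1.0 / F)          # one uniform mixture over 5 domains
        >>> scaling_law_func(X, params).shape
        (1, 5)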
"""
X = np.atleast_2d(np.asarray(data_points)) # (N, F)
N, F = X.shape # F = 5 (number of domains)
# Unpack parameters: [b_1..b_F, c_11..c_FF, e_1..e_F]
# The parameters are ordered as: F biases, F*F coefficients, F exponents
biases = params[:F] # (F,) - bias for each output domain L_j
coeffs_flat = params[F : F + F*F] # (F*F,) - flattened coefficients
coeffs = coeffs_flat.reshape(F, F) # (F, F) - c_jk where c_jk is coeffs[j, k]
# coeffs[j, k] means the influence of input domain k on output domain j
    # Exponents are specific to the output domain j: one exponent e_j is shared across all input proportions in L_j.
exponents = params[F + F*F : F + F*F + F] # (F,) - exponent for each output domain L_j
predicted_losses = np.zeros_like(X, dtype=float) # (N, F)
# Calculate predictions for each output domain L_j
for j in range(F):
# Get the exponent specific to this output domain j
ej = exponents[j]
# Calculate X_k^ej for all input domains k and all data points N
        # np.where maps x_k = 0 to a zero contribution, which is the desired behavior when a
        # proportion of 0 means the domain is absent (it also sidesteps the 0^0 = 1 convention
        # when e_j = 0). The non-negative exponent bounds additionally keep np.power from
        # raising 0 to a negative power, which would produce inf.
X_powered_by_ej = np.where(X > 0, np.power(X, ej), 0.0) # (N, F)
# Multiply by coefficients c_jk for this specific output domain j
# coeffs[j, :] gives the (F,) array of coefficients [c_j1, c_j2, ..., c_jF]
# [None, :] broadcasts it to (1, F) for element-wise multiplication with (N, F)
contributions_for_j = X_powered_by_ej * coeffs[j, :][None, :] # (N, F)
# Sum contributions from all input domains k for each data point
sum_contributions_for_j = np.sum(contributions_for_j, axis=1) # (N,)
# Add bias for this output domain j
predicted_losses[:, j] = biases[j] + sum_contributions_for_j
return predicted_losses


def fit_scaling_law(data_points, loss_values):
"""
Optimizes the parameters for the scaling_law_func using L-BFGS-B with multiple initializations.
Args:
data_points (np.ndarray): (N, F) array with domain proportions for F domains.
loss_values (np.ndarray): Corresponding multi-domain losses (N, F).
Returns:
np.ndarray: Optimized parameters (1D array of 35 parameters).
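    Example (illustrative sketch with synthetic data; not a doctest, since the fit is stochastic):
        X = np.random.dirichlet(np.ones(5), size=32)   # 32 mixtures over 5 domains
        y = 2.0 + np.random.rand(32, 5)                # placeholder losses in a plausible range
        params = fit_scaling_law(X, y)                 # 1D array of 35 parameters
        preds = scaling_law_func(X, params)            # (32, 5) predicted losses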
"""
X = np.atleast_2d(np.asarray(data_points)) # (N, F)
y = np.asarray(loss_values) # (N, F)
N, F = X.shape # F = 5 (number of domains)
# Number of parameters for the multi-output model: F biases + F*F coefficients + F exponents
num_params = F + F*F + F # 5 + 25 + 5 = 35
# Bounds for parameters to ensure numerical stability and reasonable values
    # Biases: Loss values are positive; the range [0, 5] comfortably covers the observed losses (roughly 1.8-4.2).
bias_bounds = [(0.0, 5.0)] * F
# Coefficients: Can be positive or negative, allowing for various interaction types. Range [-10, 10].
coeff_bounds = [(-10.0, 10.0)] * (F*F)
# Exponents: Must be non-negative. Common scaling law exponents are often between 0 and 2.
    # Allowing up to 5 gives the optimizer flexibility, while the random initializations below use a tighter range.
exponent_bounds_optimizer = [(0.0, 5.0)] * F
# Combined bounds for the L-BFGS-B optimizer
bounds = bias_bounds + coeff_bounds + exponent_bounds_optimizer
def objective(params_flat):
"""Calculates the Mean Squared Error between predictions and actual loss values."""
pred = scaling_law_func(X, params_flat) # (N, F)
mse = np.mean((pred - y) ** 2)
return mse
best_mse = np.inf
best_params = None
# --- Deterministic initial guess ---
# Biases: Mean loss for each domain is a good starting point.
init_biases_det = np.mean(y, axis=0) # (F,)
# Coefficients: Initialize to zeros. The optimizer will find the interactions.
init_coeffs_det = np.zeros((F, F)) # (F, F)
# Exponents: Initialize to 1.0 (linear scaling) for all output domains.
init_exponents_det = np.ones(F) * 1.0 # (F,)
# Combine initial parameters into a single 1D array
initial_params_det = np.concatenate([init_biases_det, init_coeffs_det.ravel(), init_exponents_det]) # (num_params,)
# 1. Run optimization with the deterministic initial guess
result_det = minimize(objective, initial_params_det, method='L-BFGS-B', bounds=bounds)
if result_det.success and result_det.fun < best_mse:
best_mse = result_det.fun
best_params = result_det.x
# --- Multiple Random Initializations ---
N_TRIALS = 30 # Number of random initializations to try for better global optimum search
# Tighter random initialization bounds for exponents (more typical range)
exponent_random_init_range = (0.0, 2.0)
for _ in range(N_TRIALS):
# Generate random initial parameters within their respective bounds
random_biases = np.random.uniform(bias_bounds[0][0], bias_bounds[0][1], F)
random_coeffs = np.random.uniform(coeff_bounds[0][0], coeff_bounds[0][1], F*F)
random_exponents = np.random.uniform(exponent_random_init_range[0], exponent_random_init_range[1], F)
random_initial_params = np.concatenate([random_biases, random_coeffs, random_exponents])
result_rand = minimize(objective, random_initial_params, method='L-BFGS-B', bounds=bounds)
if result_rand.success and result_rand.fun < best_mse:
best_mse = result_rand.fun
best_params = result_rand.x
# Fallback: if no successful optimization was found, return the deterministic initial guess.
# This scenario should be rare with good bounds and initial guesses.
if best_params is None:
return initial_params_det
return best_params
# EVOLVE-BLOCK-END
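

# Minimal illustrative smoke test (demonstration values only, assumed rather than taken
# from any experiment): generate synthetic mixtures from a known parameterization of the
# scaling law above, refit it with fit_scaling_law, and report the reconstruction error.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    F = 5
    true_params = np.concatenate([
        rng.uniform(1.8, 4.2, F),        # biases b_j in the typical loss range
        rng.uniform(-1.0, 1.0, F * F),   # coefficients c_jk
        rng.uniform(0.3, 1.5, F),        # per-output-domain exponents e_j
    ])
    X_demo = rng.dirichlet(np.ones(F), size=64)           # 64 mixture proportions
    y_demo = scaling_law_func(X_demo, true_params)
    y_demo += rng.normal(scale=0.01, size=y_demo.shape)   # small observation noise
    fitted = fit_scaling_law(X_demo, y_demo)
    demo_mse = np.mean((scaling_law_func(X_demo, fitted) - y_demo) ** 2)
    print(f"Demo fit MSE: {demo_mse:.6f}")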