# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Evolved program using a 4-parameter power law with a data size shift,
improved initial parameter guessing, robust bounds, parameter transformation for optimization,
and vectorized scaling_law_func.
"""
import numpy as np
from scipy.optimize import minimize


def scaling_law_func(data_points, params):
"""
Predicts loss values based on a 4-parameter scaling law:
L = A * (data_size + C)^(-alpha) + B
data_points: (N,1) array with columns [data_size]
params: Array of up to 4 parameters. Expected shape (T, P) where P=4, or (P,) for T=1.
[A, alpha, C, B] for each of T parameter sets.
Returns: Predicted loss values (N,) or (N, T)
"""
X = np.atleast_2d(np.asarray(data_points)) # (N, F)
N, F = X.shape
# As per problem description, F=1 (data_size is the only feature)
if F != 1:
raise ValueError(f"scaling_law_func expects data_points to have 1 feature (data_size), but got {F}.")
params = np.asarray(params)
if params.ndim == 1:
# If a single set of parameters is passed as a 1D array, reshape it to (1, P)
params = params[None, :]
T, P = params.shape # T: number of parameter sets, P: number of parameters per set
# Ensure P matches the expected 4 parameters for our scaling law
if P != 4:
raise ValueError(f"Expected 4 parameters [A, alpha, C, B] for scaling_law_func, but got {P}.")
A_vals = params[:, 0] # (T,)
alpha_vals = params[:, 1] # (T,)
C_vals = params[:, 2] # (T,)
B_vals = params[:, 3] # (T,)
data_size = X[:, 0] # (N,)
# Expand data_size to (N, 1) and parameter arrays to (1, T) for broadcasting
data_size_expanded = data_size[:, np.newaxis] # (N, 1)
# Calculate shifted data size (data_size + C) for all N data points and T parameter sets
# shifted_data_size will be (N, T)
shifted_data_size = data_size_expanded + C_vals[np.newaxis, :]
# Numerical stability: ensure base for power operation is positive.
# This handles cases where C might temporarily lead to non-positive bases during optimization.
shifted_data_size = np.maximum(shifted_data_size, 1e-9)
# Compute the power term for each (N, T) combination
# alpha_vals[np.newaxis, :] makes it (1, T) for broadcasting
power_term = shifted_data_size ** (-alpha_vals[np.newaxis, :]) # (N, T) ** (1, T) -> (N, T)
# Compute the scaling law: L = A * (data_size + C)^(-alpha) + B
# A_vals[np.newaxis, :] and B_vals[np.newaxis, :] make them (1, T) for broadcasting
pred = A_vals[np.newaxis, :] * power_term + B_vals[np.newaxis, :] # (1, T) * (N, T) + (1, T) -> (N, T)
# Return predictions. If only one parameter set (T=1), return a 1D array.
return pred[:, 0] if T == 1 else pred
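
# Illustrative usage (assumed values, not from any dataset): scaling_law_func
# broadcasts T parameter sets against N data points in one call.
#     d = np.array([[1e3], [1e4], [1e5]])                      # (3, 1)
#     scaling_law_func(d, np.array([2.0, 0.3, 10.0, 0.5]))     # -> shape (3,)
#     scaling_law_func(d, np.array([[2.0, 0.3, 10.0, 0.5],
#                                   [1.0, 0.2,  0.0, 0.8]]))   # -> shape (3, 2)
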
def fit_scaling_law(data_points, loss_values):
"""
Optimizes the parameters for the 4-parameter scaling law using L-BFGS-B.
Uses parameter transformation (log-space for A, alpha, B) for improved optimization stability,
along with refined initial guesses and robust bounds.
data_points: (N,1) array with columns [data_size]
loss_values: Array of corresponding loss values (N,)
Returns: Optimized parameters (1D array of 4 parameters: [A, alpha, C, B])
"""
X = np.atleast_2d(np.asarray(data_points)) # (N, F)
y = np.asarray(loss_values) # (N,)
# Ensure F=1 as expected for this specific scaling law
N, F = X.shape
if F != 1:
raise ValueError(f"fit_scaling_law expects data_points to have 1 feature (data_size), but got {F}.")
data_size = X[:, 0] # Extract the data_size feature
# --- Initial Guess for Parameters [A, alpha, C, B] (in their original scale) ---
    # A: scaling coefficient. Invert the power law at the smallest data size:
    # y.max() - y.min() ~= A * data_size.min()**(-alpha), with alpha ~= 0.2 below.
    A_init_orig = (y.max() - y.min()) * (data_size.min() ** 0.2)
# alpha: Exponent of the power law.
alpha_init_orig = 0.2
# C: Data size shift.
C_init_orig = 0
    # B: asymptotic minimum loss (irreducible loss).
    # Estimate from the losses at the largest data sizes; sort by data_size first
    # so the "tail" is well-defined even if the rows are not ordered by size.
    y_by_size = y[np.argsort(data_size)]
    B_init_orig = np.mean(y_by_size[-max(1, int(len(y) * 0.2)):])
    # Keep B_init at or below the 5th percentile of observed losses,
    # but at least 1e-6 for log-transform safety.
    B_init_orig = min(B_init_orig, np.percentile(y, 5))
    B_init_orig = max(B_init_orig, 1e-6)
    # --- Transform initial guesses for optimization (log-space for A, alpha, B) ---
    # Clamp to the lower bounds used below so the log arguments are positive.
    A_init_orig = max(A_init_orig, 1e-6)          # lower bound for A
    alpha_init_orig = max(alpha_init_orig, 1e-3)  # lower bound for alpha
initial_params_transformed = np.array([
np.log(A_init_orig),
np.log(alpha_init_orig),
C_init_orig, # C is not log-transformed
np.log(B_init_orig)
])
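    # The optimizer searches in this transformed space; the inverse map, applied
    # in objective() and again after optimization, is:
    #     A = exp(t[0]), alpha = exp(t[1]), C = t[2], B = exp(t[3])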
# --- Parameter Bounds ---
min_data_size = np.min(data_size)
# Original bounds for [A, alpha, C, B]
    bounds_orig = [
        (1e-6, None),                  # A: must be positive.
        (1e-3, 1.0),                   # alpha: positive, typically < 1.0 for loss scaling.
        (-min_data_size + 1.0, None),  # C: ensures (data_size + C) >= 1.0 for all data_size.
        # B: must be positive and should sit below the lower percentiles of the
        # observed losses; max(1e-6, ...) keeps the upper bound at or above the
        # lower bound and safe for the log transform.
        (1e-6, max(1e-6, np.percentile(y, 10)))
    ]
# Transform bounds for log-space parameters (A, alpha, B)
bounds_transformed = [
(np.log(bounds_orig[0][0]), None), # log(A)
(np.log(bounds_orig[1][0]), np.log(bounds_orig[1][1])), # log(alpha)
bounds_orig[2], # C (no transformation)
(np.log(bounds_orig[3][0]), np.log(bounds_orig[3][1])) # log(B)
]
# Adjust initial_params_transformed to be strictly within their bounds.
# This helps L-BFGS-B start from a valid point.
for i in range(len(initial_params_transformed)):
lower_bound = bounds_transformed[i][0]
upper_bound = bounds_transformed[i][1]
if lower_bound is not None:
initial_params_transformed[i] = max(initial_params_transformed[i], lower_bound)
if upper_bound is not None:
initial_params_transformed[i] = min(initial_params_transformed[i], upper_bound)
# --- Objective Function ---
def objective(params_transformed_opt):
# Convert transformed parameters back to original scale for scaling_law_func
A_val = np.exp(params_transformed_opt[0])
alpha_val = np.exp(params_transformed_opt[1])
C_val = params_transformed_opt[2]
B_val = np.exp(params_transformed_opt[3])
current_params = np.array([A_val, alpha_val, C_val, B_val])
pred = scaling_law_func(X, current_params)
mse = np.mean((pred - y) ** 2)
return mse
# --- Optimization ---
# Use L-BFGS-B, which supports bounds, for robust local optimization.
result = minimize(objective, initial_params_transformed, method='L-BFGS-B', bounds=bounds_transformed)
# Convert optimized parameters back to original scale
    if result.success:
        params_opt_transformed = result.x
    else:
        # If optimization fails, fall back to the (bound-clamped) initial guess
        # rather than trusting the final iterate of a failed run.
        params_opt_transformed = initial_params_transformed
A_opt = np.exp(params_opt_transformed[0])
alpha_opt = np.exp(params_opt_transformed[1])
C_opt = params_opt_transformed[2]
B_opt = np.exp(params_opt_transformed[3])
params_opt = np.array([A_opt, alpha_opt, C_opt, B_opt])
return params_opt
# EVOLVE-BLOCK-END
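

# Minimal smoke-test sketch (outside the evolve block). The parameters, sizes, and
# noise level below are illustrative assumptions, not part of the evolved program:
# generate synthetic losses from known parameters and check that the fit recovers them.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    true_params = np.array([5.0, 0.3, 50.0, 0.7])  # assumed [A, alpha, C, B]
    sizes = np.logspace(2, 6, 20)[:, None]         # (20, 1) grid of data sizes
    losses = scaling_law_func(sizes, true_params)
    losses = losses + rng.normal(scale=0.01, size=losses.shape)  # small observation noise
    fitted = fit_scaling_law(sizes, losses)
    preds = scaling_law_func(sizes, fitted)
    print("true params:  ", true_params)
    print("fitted params:", fitted)
    print("fit RMSE:     ", float(np.sqrt(np.mean((preds - losses) ** 2))))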