SFT Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.999261
Mean R²: 0.856743
Min R²: 0.639052
Runs: 5

All Runs (sorted by R²)

#1 Run 2 (Best) R² = 0.999261
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
This evolved program implements a 4-parameter generalized logistic (sigmoid) function, closely related to a
Hill-type response curve, applied to the logarithm of data size.
This form is well suited to loss curves that saturate as data size increases and whose transition
is roughly linear on a logarithmic data-size axis.

The function is parameterized as: Loss = B + A / (1 + exp(k * (log(data_size) - log_D0)))
Where:
- A: The amplitude, i.e. the total loss decrease from the low-data-size asymptote (B + A) down to the high-data-size asymptote (B).
- B: The irreducible loss, or the asymptotic minimum loss as data_size approaches infinity.
- log_D0: The natural logarithm of the characteristic data size (D0), where the loss is halfway between (B+A) and B.
          Optimizing log_D0 directly improves numerical stability compared to optimizing D0 when data_size spans many orders of magnitude.
- k: The steepness or Hill coefficient, controlling the slope of the curve around log_D0.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predicts loss values based on data size using a 4-parameter logistic scaling law
    applied to log-transformed data_size.

    Parameters:
    - data_points: (N, 1) array with a single column [data_size]
    - params: Array of 4 parameters [A, B, log_D0, k]

    Returns:
    - Predicted loss values (N,)
    """
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)  # Accept a plain 1D array of data sizes as (N, 1)

    if X.shape[1] != 1:
        raise ValueError("scaling_law_func expects 1D data_points (data_size).")

    params_arr = np.asarray(params)

    if params_arr.shape != (4,):
        raise ValueError(f"Expected 4 parameters for the logistic scaling law [A, B, log_D0, k], but got shape {params_arr.shape}.")

    # Extract parameters
    A, B, log_D0, k = params_arr

    # Ensure k is strictly positive to maintain the expected curve shape (decreasing loss).
    # Using a small epsilon value (1e-9) to guarantee positivity without significantly altering values.
    k_safe = np.maximum(1e-9, k)
    
    # Log transform the input data_size. Add a small epsilon to avoid log(0) for robustness,
    # though data_size is expected to be positive here (min 200).
    log_X = np.log(X[:, 0] + 1e-9) 
    
    # Calculate the exponent term: k * (log(data_size) - log_D0)
    # This term drives the sigmoid shape in log-space.
    exponent_term = k_safe * (log_X - log_D0)
    
    # Calculate predicted loss: B + A / (1 + exp(exponent_term))
    # The denominator `1 + exp(exponent_term)` will always be positive, preventing division by zero.
    pred = B + A / (1 + np.exp(exponent_term)) 
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Optimizes the parameters of the 4-parameter logistic scaling law (log-transformed data_size)
    to fit the given data.

    Parameters:
    - data_points: (N, 1) array with a single column [data_size]
    - loss_values: Array of corresponding loss values (N,)

    Returns:
    - Optimized parameters (P,) where P=4 (A, B, log_D0, k)
    """
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)                  # Accept a plain 1D array of data sizes as (N, 1)
    y = np.asarray(loss_values, dtype=float)  # Ensure y is (N,)

    if X.shape[1] != 1:
        raise ValueError("fit_scaling_law expects 1D data_points.")

    # Number of parameters for the logistic function
    P = 4

    # Observed data ranges for better initial guesses and bounds
    y_min_obs, y_max_obs = np.min(y), np.max(y)
    x_min_obs, x_max_obs = np.min(X), np.max(X)

    # A small epsilon for lower bounds to ensure strict positivity for parameters that must be positive,
    # and to avoid log(0) or division by zero.
    EPS = 1e-9 

    # --- Initial Parameter Guesses ---
    # B: Irreducible loss (as data_size -> infinity). Should be positive and less than or equal to minimum observed loss.
    # Estimated slightly below the minimum observed loss, providing a robust starting point for the asymptote.
    initial_B = max(EPS, y_min_obs * 0.9) 

    # A: Amplitude of the loss decrease. Should be positive.
    # Estimated as the total observed loss range (from max observed loss down to the estimated irreducible loss).
    initial_A = max(EPS, y_max_obs - initial_B)

    # log_D0: Logarithm of the characteristic data size.
    # D0 is initialized to the median of the data sizes, a reasonable stand-in for the geometric mean
    # when sizes are roughly exponentially spaced. Clamping the median to at least 1.0 keeps the
    # logarithm finite even if X contains very small values.
    initial_log_D0 = np.log(max(1.0, np.median(X)))

    # k: Steepness of the curve. Should be positive.
    # A common default value for the Hill coefficient in similar models.
    initial_k = 1.0
    
    init_params_flat = np.array([initial_A, initial_B, initial_log_D0, initial_k])

    # --- Bounds for parameters ---
    # These bounds are crucial for L-BFGS-B robustness, preventing unphysical parameter values,
    # and guiding the optimizer towards meaningful solutions, especially with limited data.

    # A: Amplitude (must be positive; upper bound allows for some extrapolation beyond observed max)
    bounds_A = (EPS, max(1.0, y_max_obs * 2.0))               

    # B: Irreducible loss (must be positive; upper bound slightly above min observed loss for robustness)
    bounds_B = (EPS, min(y_max_obs, y_min_obs * 1.05))  # Upper bound kept just above y_min_obs so B cannot exceed the observed minimum loss by more than 5%

    # log_D0: Logarithm of characteristic data size (D0 must be positive, so log_D0 can be any real number).
    # The bounds are set based on the log of the observed data size range, extended for robustness.
    bounds_log_D0 = (np.log(max(EPS, x_min_obs / 10.0)), np.log(x_max_obs * 10.0)) 

    # k: Steepness (must be positive; upper bound increased to 10.0 to allow for steeper curves if needed)
    bounds_k = (EPS, 10.0)                                      

    bounds = [bounds_A, bounds_B, bounds_log_D0, bounds_k]

    # Ensure bounds are well-ordered (lower <= upper) to prevent potential optimizer errors
    for i in range(P):
        if bounds[i][0] > bounds[i][1]:
            bounds[i] = (bounds[i][1], bounds[i][0]) # Swap if inverted

    def objective(params):
        """
        Objective function to minimize (Mean Squared Error).
        """
        pred = scaling_law_func(X, params)  # Calculate predictions using current parameters
        mse = np.mean((pred - y) ** 2)      # Compute Mean Squared Error
        return mse

    # Use L-BFGS-B, a robust quasi-Newton optimizer that supports box bounds.
    # Tight `ftol`/`gtol` tolerances and a generous `maxiter` budget favor convergence on this small problem.
    result = minimize(objective, init_params_flat, method='L-BFGS-B', bounds=bounds,
                      options={'ftol': 1e-12, 'gtol': 1e-8, 'maxiter': 2000})

    # Return the optimized parameters. If optimization fails (result.success is False),
    # return the initial guess to ensure a consistent output and avoid errors.
    params_opt = result.x if result.success else init_params_flat

    return params_opt
# EVOLVE-BLOCK-END
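
For reference, here is a minimal usage sketch of the evolved program above on synthetic data, assuming `scaling_law_func` and `fit_scaling_law` are in scope. The data sizes, noise level, and "true" parameters below are illustrative assumptions, and the printed R² is a plain in-sample coefficient of determination, not necessarily the benchmark's evaluation metric.

Python
import numpy as np

# Hypothetical data sizes and "true" parameters [A, B, log_D0, k], used only to generate synthetic losses.
rng = np.random.default_rng(0)
data_sizes = np.logspace(np.log10(200), 6, 12).reshape(-1, 1)
true_params = np.array([1.5, 0.8, np.log(5_000.0), 1.2])
losses = scaling_law_func(data_sizes, true_params) + rng.normal(0.0, 0.01, size=data_sizes.shape[0])

# Fit the 4-parameter logistic law and score it with a simple coefficient of determination.
fitted = fit_scaling_law(data_sizes, losses)
pred = scaling_law_func(data_sizes, fitted)
r2 = 1.0 - np.sum((losses - pred) ** 2) / np.sum((losses - np.mean(losses)) ** 2)
print("fitted [A, B, log_D0, k]:", fitted)
print("in-sample R^2:", r2)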
#2 Run 4 R² = 0.998466
#3 Run 3 R² = 0.998359
#4 Run 5 R² = 0.648576
#5 Run 1 R² = 0.639052
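
As a quick arithmetic check, the headline Best/Mean/Min R² values at the top follow directly from the five run scores listed on this page:

Python
# R² values for the five runs, as listed above.
run_scores = {"Run 2": 0.999261, "Run 4": 0.998466, "Run 3": 0.998359, "Run 5": 0.648576, "Run 1": 0.639052}
scores = list(run_scores.values())
print("Best R²:", max(scores))                          # 0.999261
print("Mean R²:", round(sum(scores) / len(scores), 6))  # 0.856743
print("Min R²:", min(scores))                           # 0.639052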