SFT Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.999261
Mean R²: 0.856743
Min R²: 0.639052
Runs: 5

All Runs (sorted by R²)

#1 Run 2 (Best) R² = 0.999261
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
This evolved program implements a 4-parameter generalized logistic (sigmoid) function, closely related to a
Hill-type response curve, applied to the logarithm of data size.
This form is well suited to loss curves that saturate as data size increases and whose transition
is roughly linear on a logarithmic data-size axis.

The function is parameterized as: Loss = B + A / (1 + exp(k * (log(data_size) - log_D0)))
Where:
- A: The amplitude, i.e. the total loss decrease from the low-data-size asymptote (B + A) down to the high-data-size asymptote (B).
- B: The irreducible loss, or the asymptotic minimum loss as data_size approaches infinity.
- log_D0: The natural logarithm of the characteristic data size (D0), where the loss is halfway between (B+A) and B.
          Optimizing log_D0 directly improves numerical stability compared to optimizing D0 when data_size spans many orders of magnitude.
- k: The steepness or Hill coefficient, controlling the slope of the curve around log_D0.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predicts loss values based on data size using a 4-parameter logistic scaling law
    applied to log-transformed data_size.

    Parameters:
    - data_points: (N, 1) array with a single column [data_size]
    - params: Array of 4 parameters [A, B, log_D0, k]

    Returns:
    - Predicted loss values (N,)
    """
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)  # Accept a plain 1D array of data sizes as (N, 1)

    if X.shape[1] != 1:
        raise ValueError("scaling_law_func expects 1D data_points (data_size).")

    params_arr = np.asarray(params)

    if params_arr.shape != (4,):
        raise ValueError(f"Expected 4 parameters for the logistic scaling law [A, B, log_D0, k], but got shape {params_arr.shape}.")

    # Extract parameters
    A, B, log_D0, k = params_arr

    # Ensure k is strictly positive to maintain the expected curve shape (decreasing loss).
    # Using a small epsilon value (1e-9) to guarantee positivity without significantly altering values.
    k_safe = np.maximum(1e-9, k)
    
    # Log transform the input data_size. Add a small epsilon to avoid log(0) for robustness,
    # though data_size is expected to be positive here (min 200).
    log_X = np.log(X[:, 0] + 1e-9) 
    
    # Calculate the exponent term: k * (log(data_size) - log_D0)
    # This term drives the sigmoid shape in log-space.
    exponent_term = k_safe * (log_X - log_D0)
    
    # Calculate predicted loss: B + A / (1 + exp(exponent_term))
    # The denominator `1 + exp(exponent_term)` will always be positive, preventing division by zero.
    pred = B + A / (1 + np.exp(exponent_term)) 
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Optimizes the parameters of the 4-parameter logistic scaling law (log-transformed data_size)
    to fit the given data.

    Parameters:
    - data_points: (N, 1) array with a single column [data_size]
    - loss_values: Array of corresponding loss values (N,)

    Returns:
    - Optimized parameters (P,) where P=4 (A, B, log_D0, k)
    """
    X = np.asarray(data_points, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)                  # Accept a plain 1D array of data sizes as (N, 1)
    y = np.asarray(loss_values, dtype=float)  # Ensure y is (N,)

    if X.shape[1] != 1:
        raise ValueError("fit_scaling_law expects 1D data_points.")

    # Number of parameters for the logistic function
    P = 4

    # Observed data ranges for better initial guesses and bounds
    y_min_obs, y_max_obs = np.min(y), np.max(y)
    x_min_obs, x_max_obs = np.min(X), np.max(X)

    # A small epsilon for lower bounds to ensure strict positivity for parameters that must be positive,
    # and to avoid log(0) or division by zero.
    EPS = 1e-9 

    # --- Initial Parameter Guesses ---
    # B: Irreducible loss (as data_size -> infinity). Should be positive and less than or equal to minimum observed loss.
    # Estimated slightly below the minimum observed loss, providing a robust starting point for the asymptote.
    initial_B = max(EPS, y_min_obs * 0.9) 

    # A: Amplitude of the loss decrease. Should be positive.
    # Estimated as the total observed loss range (from max observed loss down to the estimated irreducible loss).
    initial_A = max(EPS, y_max_obs - initial_B)

    # log_D0: Logarithm of the characteristic data size.
    # D0 is initialized to the median of the data sizes, a reasonable stand-in for the geometric mean
    # when sizes are roughly exponentially spaced. Clamping the median to at least 1.0 keeps the
    # logarithm finite even if X contains very small values.
    initial_log_D0 = np.log(max(1.0, np.median(X)))

    # k: Steepness of the curve. Should be positive.
    # A common default value for the Hill coefficient in similar models.
    initial_k = 1.0
    
    init_params_flat = np.array([initial_A, initial_B, initial_log_D0, initial_k])

    # --- Bounds for parameters ---
    # These bounds are crucial for L-BFGS-B robustness, preventing unphysical parameter values,
    # and guiding the optimizer towards meaningful solutions, especially with limited data.

    # A: Amplitude (must be positive; upper bound allows for some extrapolation beyond observed max)
    bounds_A = (EPS, max(1.0, y_max_obs * 2.0))               

    # B: Irreducible loss (must be positive; upper bound slightly above min observed loss for robustness)
    bounds_B = (EPS, min(y_max_obs, y_min_obs * 1.05))  # Upper bound kept just above y_min_obs so B cannot exceed the observed minimum loss by more than 5%

    # log_D0: Logarithm of characteristic data size (D0 must be positive, so log_D0 can be any real number).
    # The bounds are set based on the log of the observed data size range, extended for robustness.
    bounds_log_D0 = (np.log(max(EPS, x_min_obs / 10.0)), np.log(x_max_obs * 10.0)) 

    # k: Steepness (must be positive; upper bound increased to 10.0 to allow for steeper curves if needed)
    bounds_k = (EPS, 10.0)                                      

    bounds = [bounds_A, bounds_B, bounds_log_D0, bounds_k]

    # Ensure bounds are well-ordered (lower <= upper) to prevent potential optimizer errors
    for i in range(P):
        if bounds[i][0] > bounds[i][1]:
            bounds[i] = (bounds[i][1], bounds[i][0]) # Swap if inverted

    def objective(params):
        """
        Objective function to minimize (Mean Squared Error).
        """
        pred = scaling_law_func(X, params)  # Calculate predictions using current parameters
        mse = np.mean((pred - y) ** 2)      # Compute Mean Squared Error
        return mse

    # Use L-BFGS-B, a robust quasi-Newton optimizer that supports box bounds.
    # Tight `ftol`/`gtol` tolerances and a generous `maxiter` budget favor convergence on this small problem.
    result = minimize(objective, init_params_flat, method='L-BFGS-B', bounds=bounds,
                      options={'ftol': 1e-12, 'gtol': 1e-8, 'maxiter': 2000})

    # Return the optimized parameters. If optimization fails (result.success is False),
    # return the initial guess to ensure a consistent output and avoid errors.
    params_opt = result.x if result.success else init_params_flat

    return params_opt
# EVOLVE-BLOCK-END
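
For reference, here is a minimal usage sketch of the evolved program above on synthetic data, assuming `scaling_law_func` and `fit_scaling_law` are in scope. The data sizes, noise level, and "true" parameters below are illustrative assumptions, and the printed R² is a plain in-sample coefficient of determination, not necessarily the benchmark's evaluation metric.

Python
import numpy as np

# Hypothetical data sizes and "true" parameters [A, B, log_D0, k], used only to generate synthetic losses.
rng = np.random.default_rng(0)
data_sizes = np.logspace(np.log10(200), 6, 12).reshape(-1, 1)
true_params = np.array([1.5, 0.8, np.log(5_000.0), 1.2])
losses = scaling_law_func(data_sizes, true_params) + rng.normal(0.0, 0.01, size=data_sizes.shape[0])

# Fit the 4-parameter logistic law and score it with a simple coefficient of determination.
fitted = fit_scaling_law(data_sizes, losses)
pred = scaling_law_func(data_sizes, fitted)
r2 = 1.0 - np.sum((losses - pred) ** 2) / np.sum((losses - np.mean(losses)) ** 2)
print("fitted [A, B, log_D0, k]:", fitted)
print("in-sample R^2:", r2)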
#2 Run 4 R² = 0.998466
#3 Run 3 R² = 0.998359
#4 Run 5 R² = 0.648576
#5 Run 1 R² = 0.639052
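
As a quick arithmetic check, the headline Best/Mean/Min R² values at the top follow directly from the five run scores listed on this page:

Python
# R² values for the five runs, as listed above.
run_scores = {"Run 2": 0.999261, "Run 4": 0.998466, "Run 3": 0.998359, "Run 5": 0.648576, "Run 1": 0.639052}
scores = list(run_scores.values())
print("Best R²:", max(scores))                          # 0.999261
print("Mean R²:", round(sum(scores) / len(scores), 6))  # 0.856743
print("Min R²:", min(scores))                           # 0.639052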