# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares
def scaling_law_func(data_points, params):
"""
Predicts LM loss based on learning rate, batch size, data size, and non-embedding parameter size.
The model is of the form:
        Loss = L_0 + c_lr1 * lr^e_lr1 + c_lr2 * lr^e_lr2 + c_bsz * bsz^e_bsz
               + c_data * data_size^e_data + c_params * non_embedding_param_size^e_params
This model uses two learning rate terms to capture a U-shaped or more complex relationship,
where one term typically models the benefit of increasing LR (e_lr1 < 0) and the other
the detriment (e_lr2 > 0).
Args:
data_points (np.ndarray): (N, 4) array with columns [lr, bsz, data_size, non_embedding_param_size].
params (np.ndarray): Array of model parameters. Can be (P,) for a single model or (T, P) for multiple.
Expected P=11: [L_0, c_lr1, e_lr1, c_lr2, e_lr2, c_bsz, e_bsz, c_data, e_data, c_params, e_params].
Returns:
        np.ndarray: Predicted LM loss values. Shape (N,) if params is (P,), or (N, T) if params is (T, P).
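    Example (illustrative only; the values below are placeholders, not real training runs):
        X = np.array([[1e-3, 256, 1e9, 1e8],
                      [3e-4, 512, 2e9, 3e8]])       # (N=2, 4) inputs
        theta = np.ones(11)                          # one parameter set, shape (P,) = (11,)
        scaling_law_func(X, theta)                   # returns shape (2,)
        scaling_law_func(X, np.ones((5, 11)))        # T=5 parameter sets -> shape (2, 5)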
"""
X = np.atleast_2d(np.asarray(data_points)) # (N, F)
# Ensure all inputs are positive before log/power to prevent numerical issues
X = np.maximum(X, 1e-10)
params_arr = np.asarray(params)
    # Support both a single parameter vector of shape (P,) and a batch of T parameter sets of shape (T, P)
if params_arr.ndim == 1:
params_arr = params_arr[None, :] # Make it (1, P)
T, P = params_arr.shape # T: number of parameter sets, P: number of parameters per set
# Expected number of parameters for this specific model structure
# 1 (L0) + 4 (lr terms) + 2 (bsz) + 2 (data_size) + 2 (params_size) = 11
EXPECTED_P = 11
if P != EXPECTED_P:
if P > EXPECTED_P:
# If more parameters are passed than expected, use only the first EXPECTED_P
params_arr = params_arr[:, :EXPECTED_P]
P = EXPECTED_P
else:
# If fewer parameters are passed, it's an error in model definition or parameter passing.
raise ValueError(f"Expected {EXPECTED_P} parameters per set for the scaling law model, but received {P}. "
"Please check the number of parameters defined in the model structure.")
# Extract parameters for each parameter set (T sets)
L0_arr = params_arr[:, 0]
c_lr1_arr, e_lr1_arr = params_arr[:, 1], params_arr[:, 2]
c_lr2_arr, e_lr2_arr = params_arr[:, 3], params_arr[:, 4]
c_bsz_arr, e_bsz_arr = params_arr[:, 5], params_arr[:, 6]
c_data_arr, e_data_arr = params_arr[:, 7], params_arr[:, 8]
c_params_arr, e_params_arr = params_arr[:, 9], params_arr[:, 10]
    # Clamp coefficients to a small positive floor so each power-law term stays
    # non-negative, guarding against floating-point issues or optimizer edge cases.
c_lr1_arr = np.maximum(c_lr1_arr, 1e-10)
c_lr2_arr = np.maximum(c_lr2_arr, 1e-10)
c_bsz_arr = np.maximum(c_bsz_arr, 1e-10)
c_data_arr = np.maximum(c_data_arr, 1e-10)
c_params_arr = np.maximum(c_params_arr, 1e-10)
# Calculate individual contributions using log-space for numerical stability,
# then exponentiate. This is generally preferred over direct X**e for robustness
# with arbitrary real exponents and very small base values.
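    # (For x > 0, x**e == exp(e * log(x)); since X was floored at 1e-10 above, log(X) is
    # always finite and this identity applies to every feature column.)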
# X[:, feature_idx][:, None] makes it (N, 1) for broadcasting against (1, T) parameter arrays
log_X_lr = np.log(X[:, 0][:, None])
log_X_bsz = np.log(X[:, 1][:, None])
log_X_data = np.log(X[:, 2][:, None])
log_X_params = np.log(X[:, 3][:, None])
# Learning rate terms (U-shaped contribution)
term_lr1 = c_lr1_arr[None, :] * np.exp(e_lr1_arr[None, :] * log_X_lr)
term_lr2 = c_lr2_arr[None, :] * np.exp(e_lr2_arr[None, :] * log_X_lr)
term_lr = term_lr1 + term_lr2
# Other terms
term_bsz = c_bsz_arr[None, :] * np.exp(e_bsz_arr[None, :] * log_X_bsz)
term_data = c_data_arr[None, :] * np.exp(e_data_arr[None, :] * log_X_data)
term_params = c_params_arr[None, :] * np.exp(e_params_arr[None, :] * log_X_params)
# Sum all contributions
pred = L0_arr[None, :] + term_lr + term_bsz + term_data + term_params
# Ensure predictions are non-negative, as loss cannot be negative.
# This also helps clip any numerically unstable negative predictions that might arise.
pred = np.maximum(pred, 0.0)
# If only one set of parameters was passed (T=1), return a 1D array (N,)
return pred[:, 0] if T == 1 else pred
def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law function to the given data points and loss values.
Args:
data_points (np.ndarray): (N, 4) array with columns [lr, bsz, data_size, non_embedding_param_size].
loss_values (np.ndarray): (N,) array of corresponding lm loss values.
Returns:
np.ndarray: Optimized parameters (P,) for the scaling law function.
[L_0, c_lr1, e_lr1, c_lr2, e_lr2, c_bsz, e_bsz, c_data, e_data, c_params, e_params].
"""
X = np.atleast_2d(np.asarray(data_points))
y = np.asarray(loss_values)
def residuals(params, X, y):
pred = scaling_law_func(X, params)
res = pred - y
        # Robustly handle NaN/Inf predictions: replace any non-finite residual with a
        # large finite penalty so the optimizer is strongly pushed away from these regions.
        res = np.where(np.isfinite(res), res, 1e10)
return res
    # Total number of parameters: 1 (L0) + 4 (two LR terms) + 2 (bsz) + 2 (data) + 2 (params) = 11
P = 11
# Initial guess for parameters: [L_0, c_lr1, e_lr1, c_lr2, e_lr2, c_bsz, e_bsz, c_data, e_data, c_params, e_params]
# These initial guesses are informed by typical LLM scaling laws and data ranges,
# with a focus on capturing the U-shaped learning rate behavior.
initial_params = np.array([
np.min(y) * 0.9, # L_0: Irreducible loss, slightly below min observed loss
1e-4, -1.0, # c_lr1, e_lr1: For the decreasing loss part with increasing LR (e.g., 1/LR)
1e3, 1.0, # c_lr2, e_lr2: For the increasing loss part with increasing LR (e.g., LR)
0.1, 0.1, # c_bsz, e_bsz: Small effect, potentially slightly positive exponent for batch size
10.0, -0.1, # c_data, e_data: Data typically reduces loss (negative exponent)
5.0, -0.1 # c_params, e_params: Parameters typically reduce loss (negative exponent)
])
# Refined bounds for parameters to guide the optimizer and ensure physical realism.
# Coefficients (c_i) are generally positive. Exponents (e_i) are constrained based on expected effects.
lower_bounds = np.array([
0.0, # L_0: Irreducible loss must be non-negative
1e-10, -3.0, # c_lr1 (positive), e_lr1 (negative for 1/lr effect)
1e-10, 0.01, # c_lr2 (positive), e_lr2 (positive for lr effect)
1e-10, -1.0, # c_bsz (positive), e_bsz (can be negative or positive, but not too extreme)
1e-10, -1.0, # c_data (positive), e_data (negative or zero, increasing data should not increase loss)
1e-10, -1.0 # c_params (positive), e_params (negative or zero, increasing params should not increase loss)
])
upper_bounds = np.array([
        np.max(y) * 1.5, # L_0: capped at 1.5x the maximum observed loss
        1e2, -0.01, # c_lr1; e_lr1 capped at -0.01 so it stays negative
        1e5, 3.0, # c_lr2; e_lr2 capped at 3.0 (its positivity is enforced by the lower bound of 0.01)
1e3, 1.0, # c_bsz, e_bsz
1e4, 0.0, # c_data, e_data (<= 0)
1e4, 0.0 # c_params, e_params (<= 0)
])
    # Use the 'trf' (Trust Region Reflective) method, which handles bounds well and is robust
    # for non-linear least squares. verbose=0 suppresses convergence messages, max_nfev=5000
    # allows plenty of function evaluations for this complex landscape, and ftol/xtol are set
    # explicitly to 1e-8 for tight convergence tolerances.
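    # Note: scipy's least_squares also supports robust losses (e.g., loss='soft_l1' or
    # loss='huber') that down-weight large residuals; the default squared loss is kept here.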
result = least_squares(residuals, initial_params, args=(X, y),
bounds=(lower_bounds, upper_bounds),
method='trf', verbose=0, max_nfev=5000, ftol=1e-8, xtol=1e-8)
if result.success:
return result.x
else:
# If optimization fails, return the initial_params as a robust fallback.
print("Warning: least_squares optimization failed. Returning initial parameters.")
return initial_params
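# Minimal usage sketch (illustrative only): fit the law on synthetic data and predict.
# The data ranges and "true" parameters below are placeholders, not from any real training run.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    demo_X = np.column_stack([
        10 ** rng.uniform(-4, -2, n),   # learning rate
        2.0 ** rng.integers(6, 10, n),  # batch size
        10 ** rng.uniform(8, 10, n),    # data size (tokens)
        10 ** rng.uniform(7, 9, n),     # non-embedding parameter count
    ])
    # Synthetic "ground truth" losses drawn from the model family itself, plus small noise.
    true_params = np.array([2.0, 1e-4, -0.5, 50.0, 1.0, 0.5, -0.1, 20.0, -0.2, 10.0, -0.2])
    demo_y = scaling_law_func(demo_X, true_params) + rng.normal(0, 0.01, n)
    fitted = fit_scaling_law(demo_X, demo_y)
    preds = scaling_law_func(demo_X, fitted)
    print("Fitted parameters:", np.round(fitted, 4))
    print("RMSE on synthetic data:", float(np.sqrt(np.mean((preds - demo_y) ** 2))))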
# EVOLVE-BLOCK-END