
LR-BSZ Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.918060
Mean R²: 0.913879
Min R²: 0.904116
Runs: 5
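
The R² figures above presumably refer to the standard coefficient of determination between predicted and observed LM losses; the exact evaluation protocol is not shown on this page, so the snippet below is only a minimal sketch of that metric, with the helper name r_squared chosen for illustration.

Python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)            # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)   # total sum of squares
    return 1.0 - ss_res / ss_tot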

All Runs (sorted by R²)

#1 Run 1 R² = 0.918060 (Best)
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares

def scaling_law_func(data_points, params):
    """
    Predicts LM loss based on learning rate, batch size, data size, and non-embedding parameter size.

    The model is of the form:
    Loss = L_0 + c_lr_pos * lr^e_lr_pos + c_lr_neg * lr^e_lr_neg + c_bsz * bsz^e_bsz + c_data * data_size^e_data + c_params * non_embedding_param_size^e_params

    Args:
        data_points (np.ndarray): (N, 4) array with columns [lr, bsz, data_size, non_embedding_param_size].
        params (np.ndarray): Array of model parameters. Can be (P,) for a single model or (T, P) for multiple.
                             Expected P=11: [L_0, c_lr_pos, e_lr_pos, c_lr_neg, e_lr_neg, c_bsz, e_bsz, c_data, e_data, c_params, e_params].

    Returns:
        np.ndarray: Predicted lm loss values. Shape (N,) when a single parameter set is supplied, otherwise (N, T).
    """
    X = np.atleast_2d(np.asarray(data_points)) # (N, F)
    # Ensure all inputs are positive to avoid issues with log(0) or log(negative)
    # Using a small epsilon (1e-10) to prevent log of zero or negative numbers.
    X = np.maximum(X, 1e-10)

    params_arr = np.asarray(params)
    # Adapt to the original framework's potential (T, P) parameter passing
    if params_arr.ndim == 1:
        params_arr = params_arr[None, :] # Make it (1, P)

    T, P = params_arr.shape # T: number of parameter sets, P: number of parameters per set

    # Expected number of parameters for this specific model structure
    # 1 (L0) + 2*2 (LR: c_pos, e_pos, c_neg, e_neg) + 1*2 (BSZ) + 1*2 (Data) + 1*2 (Params) = 11
    EXPECTED_P = 11
    if P != EXPECTED_P:
        if P > EXPECTED_P:
            params_arr = params_arr[:, :EXPECTED_P]
            P = EXPECTED_P
        else:
            raise ValueError(f"Expected {EXPECTED_P} parameters per set for the scaling law model, but received {P}.")

    # Extract parameters for each parameter set (T sets)
    L0_arr = params_arr[:, 0]
    c_lr_pos_arr, e_lr_pos_arr = params_arr[:, 1], params_arr[:, 2]
    c_lr_neg_arr, e_lr_neg_arr = params_arr[:, 3], params_arr[:, 4]
    c_bsz_arr, e_bsz_arr = params_arr[:, 5], params_arr[:, 6]
    c_data_arr, e_data_arr = params_arr[:, 7], params_arr[:, 8]
    c_params_arr, e_params_arr = params_arr[:, 9], params_arr[:, 10]

    # Ensure coefficients are non-negative for the power-law terms, guarding against
    # floating-point issues or edge cases during optimization. These are safeguards;
    # the bounds in fit_scaling_law enforce non-negativity for the optimized result.
    c_lr_pos_arr = np.maximum(c_lr_pos_arr, 1e-10)
    c_lr_neg_arr = np.maximum(c_lr_neg_arr, 1e-10)
    c_bsz_arr = np.maximum(c_bsz_arr, 1e-10)
    c_data_arr = np.maximum(c_data_arr, 1e-10)
    c_params_arr = np.maximum(c_params_arr, 1e-10)

    # Calculate individual contributions using log-space for numerical stability (x^e = exp(e * log(x)))
    # X[:, feature_idx][:, None] makes it (N, 1) for broadcasting against (1, T) parameter arrays
    log_X_lr = np.log(X[:, 0][:, None])
    log_X_bsz = np.log(X[:, 1][:, None])
    log_X_data = np.log(X[:, 2][:, None])
    log_X_params = np.log(X[:, 3][:, None])

    term_lr_pos = c_lr_pos_arr[None, :] * np.exp(e_lr_pos_arr[None, :] * log_X_lr)
    term_lr_neg = c_lr_neg_arr[None, :] * np.exp(e_lr_neg_arr[None, :] * log_X_lr)
    term_bsz = c_bsz_arr[None, :] * np.exp(e_bsz_arr[None, :] * log_X_bsz)
    term_data = c_data_arr[None, :] * np.exp(e_data_arr[None, :] * log_X_data)
    term_params = c_params_arr[None, :] * np.exp(e_params_arr[None, :] * log_X_params)

    # Sum all contributions
    pred = L0_arr[None, :] + term_lr_pos + term_lr_neg + term_bsz + term_data + term_params

    # If only one set of parameters was passed (T=1), return a 1D array (N,)
    return pred[:, 0] if T == 1 else pred


def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law function to the given data points and loss values.

    Args:
        data_points (np.ndarray): (N, 4) array with columns [lr, bsz, data_size, non_embedding_param_size].
        loss_values (np.ndarray): (N,) array of corresponding lm loss values.

    Returns:
        np.ndarray: Optimized parameters (P,) for the scaling law function.
                    [L_0, c_lr_pos, e_lr_pos, c_lr_neg, e_lr_neg, c_bsz, e_bsz, c_data, e_data, c_params, e_params].
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)

    def residuals(params, X, y):
        pred = scaling_law_func(X, params)
        # Replace non-finite predictions (NaN, inf) with the mean observed loss so the
        # optimizer does not fail on extreme values; the power-law terms can overflow
        # for unfavorable parameter combinations during the search.
        pred_clean = np.copy(pred)  # copy to avoid modifying 'pred' in place
        mean_y = np.mean(y)
        pred_clean[~np.isfinite(pred_clean)] = mean_y
        return pred_clean - y

    # Total number of parameters for the model (11 parameters)
    P = 11

    # Initial guess for the parameters: [L_0, c_lr_pos, e_lr_pos, c_lr_neg, e_lr_neg, c_bsz, e_bsz, c_data, e_data, c_params, e_params].
    # The guesses are informed by typical LLM scaling laws and the data ranges, and are
    # designed to capture the U-shaped effect of the learning rate; the LR coefficients
    # start at a moderate value of 1.0.
    initial_params = np.array([
        np.min(y) * 0.95, # L_0: irreducible loss, slightly below the minimum observed loss
        1.0,   1.0,       # c_lr_pos, e_lr_pos: at high LR, loss increases (positive exponent)
        1.0,  -1.0,       # c_lr_neg, e_lr_neg: at low LR, loss increases (negative exponent)
        1e-3,  0.5,       # c_bsz, e_bsz: batch-size effect (loss tends to rise with larger bsz if the LR is not rescaled)
        5.0,  -0.1,       # c_data, e_data: more data typically reduces loss (negative exponent, positive coefficient)
        3.0,  -0.1        # c_params, e_params: more parameters typically reduce loss (negative exponent, positive coefficient)
    ])

    # Bounds on the parameters, to guide the optimizer and keep the fit physically plausible.
    # The LR exponent bounds include zero (non-strict inequalities), which gives the
    # optimizer extra flexibility.
    lower_bounds = np.array([
        0.0,              # L_0: Irreducible loss must be non-negative
        1e-10, 0.0,       # c_lr_pos (positive), e_lr_pos (non-negative for increasing effect)
        1e-10, -5.0,      # c_lr_neg (positive), e_lr_neg (non-positive for increasing effect with decreasing LR)
        1e-10, -2.0,      # c_bsz, e_bsz
        1e-10, -1.0,      # c_data, e_data: e_data typically negative (more data = less loss)
        1e-10, -1.0       # c_params, e_params: e_params typically negative (more params = less loss)
    ])
    upper_bounds = np.array([
        np.max(y) * 1.5,  # L_0: capped at 1.5x the maximum observed loss
        1e5,   5.0,       # c_lr_pos, e_lr_pos
        1e5,   0.0,       # c_lr_neg, e_lr_neg
        1e5,   2.0,       # c_bsz, e_bsz
        1e5,   0.0,       # c_data, e_data: e_data <= 0
        1e5,   0.0         # c_params, e_params: e_params <= 0
    ])

    # Clip initial parameters to ensure they are within the defined bounds
    initial_params = np.clip(initial_params, lower_bounds, upper_bounds)

    # Use the 'trf' (Trust Region Reflective) method, which handles bounds well and is
    # robust for non-linear least squares. verbose=0 suppresses convergence messages;
    # max_nfev=4000 caps the number of function evaluations.
    result = least_squares(residuals, initial_params, args=(X, y),
                           bounds=(lower_bounds, upper_bounds),
                           method='trf', verbose=0, max_nfev=4000)

    if result.success:
        return result.x
    else:
        # Fallback to initial parameters if optimization fails.
        # A warning is printed to alert about potential issues.
        print("Warning: least_squares optimization failed. Returning initial parameters.")
        return initial_params
# EVOLVE-BLOCK-END
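
A minimal usage sketch of the program above, assuming only the column order [lr, bsz, data_size, non_embedding_param_size] documented in scaling_law_func; the synthetic grid and placeholder losses are illustrative, not the benchmark data, and the in-sample R² printed here is unrelated to the leaderboard scores.

Python
import numpy as np

# Hypothetical synthetic inputs (illustrative only): columns are
# [lr, bsz, data_size, non_embedding_param_size], as documented in scaling_law_func.
rng = np.random.default_rng(0)
N = 64
data_points = np.column_stack([
    10 ** rng.uniform(-4, -2, N),    # learning rate
    2.0 ** rng.integers(5, 11, N),   # batch size
    10 ** rng.uniform(8, 10, N),     # data size (tokens)
    10 ** rng.uniform(7, 9, N),      # non-embedding parameter count
])
# Placeholder losses; in the benchmark these are the measured LM losses.
loss_values = (2.0
               + 5.0 * data_points[:, 2] ** -0.1
               + 3.0 * data_points[:, 3] ** -0.1
               + 0.05 * rng.normal(size=N))

params = fit_scaling_law(data_points, loss_values)
pred = scaling_law_func(data_points, params)
ss_res = np.sum((loss_values - pred) ** 2)
ss_tot = np.sum((loss_values - np.mean(loss_values)) ** 2)
print("fitted parameters:", np.round(params, 4))
print("in-sample R^2:", 1.0 - ss_res / ss_tot)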
#2 Run 4 R² = 0.917935
#3 Run 3 R² = 0.914771
#4 Run 5 R² = 0.914512
#5 Run 2 R² = 0.904116