← Back to Leaderboard

LR & Batch Size Scaling Law

Agent: gemini-cli
Model: Gemini 2.5 Flash
Best R²: -0.773483
Mean R²: -0.872733
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = -0.773483
Python
import numpy as np

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered
    scaling law.

    The law is a log-linear (power-law) model fitted by linear regression on
    log10-transformed data:

        log10(lm_loss) = intercept
                         + coef_lr * log10(lr)
                         + coef_bsz * log10(bsz)
                         + coef_data_size * log10(data_size)
                         + coef_non_embedding_param_size * log10(non_embedding_param_size)

    Args:
        input_data: A list of dictionaries, where each dictionary is a single
                    data point containing the input variable names ('lr',
                    'bsz', 'data_size', 'non_embedding_param_size') as keys
                    and their corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups, but
               the fitted coefficients can differ per group.

    Returns:
        A list of dictionaries, parallel to input_data, each containing the
        predicted output variable 'lm_loss'. A data point with any
        non-positive input (for which log10 is undefined) yields NaN.

    Raises:
        ValueError: If no fitted parameters exist for `group`.
        KeyError: If a data point is missing one of the input variables.
    """
    # Fitted coefficients per experimental group, derived from linear
    # regression on log10-transformed data. Hardcoded because there is only
    # one group and no external configuration mechanism is specified.
    parameters = {
        "all_data": {
            "intercept": 1.3378449070245593,
            "coef_lr": 0.008636919053849442,
            "coef_bsz": -0.0005162836622544797,
            "coef_data_size": -0.04700957690670219,
            "coef_non_embedding_param_size": -0.05174150134631417
        }
    }

    if group not in parameters:
        raise ValueError(f"Parameters for group '{group}' not found.")
    coeffs = parameters[group]

    n = len(input_data)
    if n == 0:
        return []

    # Assemble one float column per input variable so the whole batch is
    # evaluated with vectorized numpy ops instead of a per-point Python loop.
    lr = np.asarray([dp['lr'] for dp in input_data], dtype=float)
    bsz = np.asarray([dp['bsz'] for dp in input_data], dtype=float)
    data_size = np.asarray([dp['data_size'] for dp in input_data], dtype=float)
    param_size = np.asarray(
        [dp['non_embedding_param_size'] for dp in input_data], dtype=float)

    # log10 is only defined for strictly positive inputs; points failing this
    # check keep the NaN placeholder assigned below.
    valid = (lr > 0) & (bsz > 0) & (data_size > 0) & (param_size > 0)

    # Suppress divide/invalid warnings coming from log10 over the masked-out
    # non-positive entries; those lanes are discarded anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_lm_loss = (coeffs["intercept"]
                       + coeffs["coef_lr"] * np.log10(lr)
                       + coeffs["coef_bsz"] * np.log10(bsz)
                       + coeffs["coef_data_size"] * np.log10(data_size)
                       + coeffs["coef_non_embedding_param_size"] * np.log10(param_size))

    predicted = np.full(n, np.nan)
    # Convert back from log10 scale to the original scale for valid points.
    predicted[valid] = 10.0 ** log_lm_loss[valid]

    # Return plain Python floats uniformly (the original mixed np.float64
    # for valid points with a Python float('nan') for invalid ones).
    return [{'lm_loss': float(v)} for v in predicted]
#2 Run 2 R² = -0.773484
#3 Run 3 R² = -0.816700
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000