LR & Batch Size Scaling Law

Agent: claude-code
Model: claude-sonnet-4-5
Best R²: -0.500295
Mean R²: -0.845783
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = -0.500295
Python
import numpy as np


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """

    # Fitted parameters for each group
    # These were discovered through nonlinear regression on the training data
    params = {
        'all_data': {
            'A': 9.995778,
            'alpha': 0.094913,
            'B': 19.996945,
            'beta': 0.133021,
            'C': -0.028150,
            'E': 0.025909,
            'lr_opt': 1.290894e-03,
            'F': 0.018372,
            'bsz_opt': 199.999997
        }
    }

    # Get parameters for this group
    if group not in params:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(params.keys())}")

    p = params[group]

    # Extract parameters
    A = p['A']
    alpha = p['alpha']
    B = p['B']
    beta = p['beta']
    C = p['C']
    E = p['E']
    lr_opt = p['lr_opt']
    F = p['F']
    bsz_opt = p['bsz_opt']

    # Make predictions for each input data point
    results = []
    for data_point in input_data:
        # Extract input variables
        N = data_point['non_embedding_param_size']
        D = data_point['data_size']
        lr = data_point['lr']
        bsz = data_point['bsz']

        # Apply the scaling law formula:
        # L = A/N^alpha + B/D^beta + C + E*(log(lr/lr_opt))^2 + F*(log(bsz/bsz_opt))^2

        # Base loss from model size and data size (power law terms)
        base_loss = A / (N ** alpha) + B / (D ** beta) + C

        # Learning rate penalty (quadratic in log space)
        lr_penalty = E * (np.log(lr / lr_opt)) ** 2

        # Batch size penalty (quadratic in log space)
        bsz_penalty = F * (np.log(bsz / bsz_opt)) ** 2

        # Total predicted loss
        lm_loss = base_loss + lr_penalty + bsz_penalty

        results.append({'lm_loss': float(lm_loss)})

    return results
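
The function above operates on plain dictionaries, so it can be called directly once defined. Below is a minimal usage sketch; the parameter counts, token counts, learning rates, and batch sizes are hypothetical values chosen only to illustrate the expected input keys, not points from the training set.

Python
# Minimal usage sketch for law(); the numeric values are hypothetical
# and only demonstrate the expected input keys.
sample_points = [
    {'non_embedding_param_size': 1.0e8, 'data_size': 2.0e10,
     'lr': 1.0e-3, 'bsz': 256},
    {'non_embedding_param_size': 1.0e9, 'data_size': 2.0e11,
     'lr': 3.0e-4, 'bsz': 512},
]

predictions = law(sample_points, group='all_data')
for point, pred in zip(sample_points, predictions):
    print(point['non_embedding_param_size'], '->', pred['lm_loss'])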
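The per-group constants are described in the code as the result of a nonlinear regression on the training data. The training split is not shown on this page, so the sketch below only illustrates how such a fit could be set up with scipy.optimize.curve_fit, using synthetic data generated from law() itself; it is an assumption about the fitting procedure, not the procedure actually used for this run. Fitting log(lr_opt) and log(bsz_opt) rather than the optima directly is a common stability choice for this kind of quadratic-in-log penalty.

Python
import numpy as np
from scipy.optimize import curve_fit


def loss_model(X, A, alpha, B, beta, C, E, log_lr_opt, F, log_bsz_opt):
    """Same functional form as law(): power-law terms in N and D plus
    quadratic-in-log penalties for learning rate and batch size."""
    N, D, lr, bsz = X
    return (A / N**alpha + B / D**beta + C
            + E * (np.log(lr) - log_lr_opt) ** 2
            + F * (np.log(bsz) - log_bsz_opt) ** 2)


# Synthetic stand-in for the (unavailable) training split, generated from
# the fitted constants above plus small noise, purely so the sketch runs.
rng = np.random.default_rng(0)
N = rng.uniform(1e7, 1e9, 200)
D = rng.uniform(1e9, 1e11, 200)
lr = 10 ** rng.uniform(-4, -2, 200)
bsz = 2.0 ** rng.integers(6, 11, 200)
points = [{'non_embedding_param_size': n, 'data_size': d, 'lr': l, 'bsz': b}
          for n, d, l, b in zip(N, D, lr, bsz)]
y = np.array([r['lm_loss'] for r in law(points, 'all_data')])
y += rng.normal(0.0, 0.01, y.shape)

# Initial guesses roughly in the range of the constants reported above.
p0 = [10.0, 0.1, 20.0, 0.1, 0.0, 0.02, np.log(1e-3), 0.02, np.log(200)]
popt, _ = curve_fit(loss_model, (N, D, lr, bsz), y, p0=p0, maxfev=50000)
print(dict(zip(['A', 'alpha', 'B', 'beta', 'C', 'E',
                'log_lr_opt', 'F', 'log_bsz_opt'], popt)))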
#2 Run 2 R² = -0.818347
#3 Run 3 R² = -0.940444
#4 Run 4 R² = -0.969830
#5 Run 5 R² = -1.000000