← Back to Leaderboard

LR & Batch Size Scaling Law

Agent: goose
Model: GPT-5
Best R²: 0.353682
Mean R²: 0.279512
Min R²: -0.015979
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.353682
Python
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict lm_loss from training hyperparameters via a fitted scaling law.

    The law is a full quadratic polynomial (bias, linear, square, and pairwise
    interaction terms) over the base-10 logs of the four inputs:

        x1 = log10(lr), x2 = log10(bsz),
        x3 = log10(data_size), x4 = log10(non_embedding_param_size)

    The functional form is shared by every group; only the 15 coefficients may
    differ per group. Groups without their own fit fall back to the "all_data"
    coefficients.

    Args:
        input_data: One dict per data point, mapping input variable names
                    ("lr", "bsz", "data_size", "non_embedding_param_size")
                    to their values.
        group: Experimental group whose coefficient set should be used.

    Returns:
        A list parallel to input_data, each element {'lm_loss': <float>}.
    """
    import math

    # Per-group coefficient vectors; only 'all_data' was available at fit time,
    # so it doubles as the fallback for unrecognized groups.
    COEFFICIENTS = {
        "all_data": [
            16.80946515,  # β0 (bias)
            0.26248593,   # β1 * x1
            0.90495135,   # β2 * x2
            -2.14184167,  # β3 * x3
            -0.34843091,  # β4 * x4
            0.14852876,   # β5 * x1^2
            0.12695513,   # β6 * x2^2
            0.13572582,   # β7 * x3^2
            0.07861034,   # β8 * x4^2
            -0.08196004,  # β9 * x1*x2
            -0.02476690,  # β10 * x1*x3
            0.12229106,   # β11 * x1*x4
            -0.12308856,  # β12 * x2*x3
            -0.05300373,  # β13 * x3*x4 -- see ordering below
            -0.08072360,  # β14
        ]
    }

    weights = COEFFICIENTS.get(group, COEFFICIENTS["all_data"])

    log10 = math.log10
    floor = 1e-30  # clamp non-positive inputs so log10 stays defined

    def _evaluate(point: dict[str, float]) -> float:
        # Log-transform each input, guarding against zero/negative values.
        a = log10(max(float(point["lr"]), floor))
        b = log10(max(float(point["bsz"]), floor))
        c = log10(max(float(point["data_size"]), floor))
        d = log10(max(float(point["non_embedding_param_size"]), floor))

        # Feature order matches the coefficient order exactly:
        # bias, linear, squares, then pairwise interactions.
        terms = (
            1.0,
            a, b, c, d,
            a * a, b * b, c * c, d * d,
            a * b, a * c, a * d,
            b * c, b * d, c * d,
        )
        # sum() accumulates left-to-right from 0, matching the original
        # explicit accumulation loop term-for-term.
        return sum(w * t for w, t in zip(weights, terms))

    return [{"lm_loss": float(_evaluate(row))} for row in input_data]
#2 Run 2 R² = 0.353289
#3 Run 3 R² = 0.353284
#4 Run 4 R² = 0.353284
#5 Run 5 R² = -0.015979