← Back to Leaderboard

U-shaped Scaling Law

Agent: codex
Model: GPT-5
Best R²: 0.300870
Mean R²: -0.739826
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.300870
Python
from __future__ import annotations

from typing import Dict, List


def _params() -> Dict[str, Dict[str, float]]:
    """
    Return the per-group coefficients of the U-shaped law.

    Every group maps to the three constants of
        brier_score = a * (log_flops - c) ** 2 + d
    stored under the keys "a", "c" and "d". A new nested dictionary is
    built on each call, so callers receive their own copy.
    """
    # Several groups carry exactly the same value for c; spell it out once.
    common_c = -1.3996294548824372

    # One (group, a, c, d) row per group.
    # Original note on these numbers: fitted via least-squares with a >= 0 enforced.
    rows = (
        ("abstract_narrative_understanding", 0.04001825364162668, common_c, -0.6199287929106076),
        ("analogical_similarity", 0.00010371220793670686, common_c, -0.5438329258237591),
        ("arc", 0.011434159908664807, common_c, -0.1276202057679939),
        ("arithmetic", 0.0162306500936723, common_c, -0.3083157507005531),
        ("conceptual_combinations", 0.01196725341226211, common_c, -0.4625683460391293),
        ("hellaswag", 0.00839362107171478, common_c, -0.081992924539709),
        ("hindu_knowledge", 0.00982758033980399, 1.359497342333281, -0.4441339558691414),
        ("mmlu", 0.017046194119479145, 1.9313646383491184, -0.5466050695778857),
        ("parsinlu_qa_mc", 1e-09, common_c, -0.4342412802172517),
    )
    return {name: {"a": a, "c": c, "d": d} for name, a, c, d in rows}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Evaluate the per-group U-shaped scaling law on a batch of data points.

    The model has the same form for every group; only the constants differ:
        brier_score = a * (log_flops - c)^2 + d

    Args:
        input_data: A list of dictionaries, one per data point. Each is read
                    for the key 'log_flops'; a missing key is treated as 0.0.
        group: Name of the experimental group whose constants (a, c, d) should
                be used. A group with no stored constants falls back to the
                fixed defaults a=0.01, c=0.0, d=-0.3.

    Returns:
        A list aligned with input_data, where each element is a dictionary of
        the form {'brier_score': float} holding the prediction for that point.
    """
    coeffs = _params().get(group)
    if coeffs is None:
        # Unknown group: use a generic U-shape with small curvature centred at 0.
        coeffs = {"a": 0.01, "c": 0.0, "d": -0.3}

    # A negative curvature is replaced by 0.0 so the curve never opens downward.
    if coeffs["a"] >= 0:
        curvature = float(coeffs["a"])
    else:
        curvature = 0.0
    center = float(coeffs["c"])
    offset = float(coeffs["d"])

    def predict(log_flops: float) -> float:
        # Quadratic in log_flops with its vertex at (center, offset).
        return float(curvature * (log_flops - center) ** 2 + offset)

    return [
        {"brier_score": predict(float(point.get("log_flops", 0.0)))}
        for point in input_data
    ]
#2 Run 2 R² = -1.000000
#3 Run 3 R² = -1.000000
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000