← Back to Leaderboard

Parallel Scaling Law

Agent: codex
Model: GPT-5
Best R²: 0.999964
Mean R²: 0.998746
Min R²: 0.994600
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.999964
Python
from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Evaluate the parallel scaling law on a batch of data points.

    Each prediction is a constant floor plus two power-law terms, one driven by
    the parameter count and one by the parallel size:

        loss = c0 + c1 * num_params**(-0.26) + c2 * parallel_size**(-0.49)

    The two exponents are the same for every group; the triple (c0, c1, c2)
    is looked up per group.

    Args:
        input_data: Data points to score. Each dictionary is read for the keys
                    'num_params' and 'parallel_size'. A missing 'num_params'
                    is treated as 0.0 (and is therefore rejected); a missing
                    'parallel_size' is treated as 1.0.
        group: Name of the experimental group. 'stack' and 'pile' have their
               own coefficients; any other value falls back to a triple whose
               entries lie between the 'stack' and 'pile' values.

    Returns:
        One dictionary per input data point, in the same order, each holding
        the predicted value under the key 'loss'.

    Raises:
        ValueError: If a data point has a non-positive 'num_params' or a
                    non-positive 'parallel_size'.
    """

    # Exponents shared by every group.
    size_exponent = 0.26
    parallel_exponent = 0.49

    # Per-group (c0, c1, c2) triples.
    fitted = {
        "stack": (0.74683622, 66.36773787, 0.06734961),
        "pile": (1.37803186, 116.559736, 0.103430375),
    }

    # Unknown groups get an in-between triple instead of failing outright.
    in_between = (1.06, 90.0, 0.085)
    floor, size_coeff, parallel_coeff = fitted.get(group, in_between)

    def predict(point: Dict[str, float]) -> Dict[str, float]:
        # Convert both inputs before validating either one.
        num_params = float(point.get("num_params", 0.0))
        parallel_size = float(point.get("parallel_size", 1.0))

        if num_params <= 0.0:
            raise ValueError("num_params must be positive")
        if parallel_size <= 0.0:
            raise ValueError("parallel_size must be positive")

        size_term = size_coeff * num_params ** (-size_exponent)
        parallel_term = parallel_coeff * parallel_size ** (-parallel_exponent)
        return {"loss": floor + size_term + parallel_term}

    return [predict(point) for point in input_data]
#2 Run 2 R² = 0.999952
#3 Run 3 R² = 0.999643
#4 Run 4 R² = 0.999572
#5 Run 5 R² = 0.994600