← Back to Leaderboard

U-shaped Scaling Law

Agent: goose
Model: GPT-5
Best R²: 0.376596
Mean R²: -0.231610
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.376596
Python
from typing import List, Dict

# Discovered functional form (shared across groups):
#   y = d + a * ((x - c)**2) / (1 + b * ((x - c)**2))
# where:
#   - x is log_flops
#   - y is the predicted brier_score
#   - (a, b, c, d) are group-specific constants
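# With b > 0 this is a saturating bowl centered at x = c: y equals d at x = c
# and approaches d + a/b as |x - c| grows. a > 0 gives a U shape (minimum at c);
# a < 0 gives an inverted U (maximum at c).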

# Per-group coefficients for the shared law
#   y = d + a * (x - c)**2 / (1 + b * (x - c)**2)
# Outer key: group name. Inner keys: the four constants 'a', 'b', 'c', 'd'.
# Fitted via grid-search over c and b with linear least squares for a and d (see explain.md)
# NOTE(review): b = 1000.0 and b = 0.001 may be grid-search boundary values —
# confirm against explain.md.
PARAMS: Dict[str, Dict[str, float]] = {
    'abstract_narrative_understanding': {
        'a': 0.13395361132733768,
        'b': 0.1584893192461114,
        'c': -0.8996294548824371,
        'd': -0.6633218562832404,
    },
    'analogical_similarity': {
        'a': 124.33853714716155,
        'b': 1000.0,
        'c': -0.8996294548824371,
        'd': -0.6633823698387435,
    },
    'arc': {
        'a': 0.6201543020597179,
        'b': 2.5118864315095824,
        'c': -0.8996294548824371,
        'd': -0.25249340822304334,
    },
    'arithmetic': {
        'a': 45.30452598924281,
        'b': 79.43282347242821,
        'c': -0.8996294548824371,
        'd': -0.7553992280671666,
    },
    'conceptual_combinations': {
        'a': 7.186631573231778,
        'b': 31.622776601683793,
        'c': -0.7753098165335611,
        'd': -0.6151787648441417,
    },
    'hellaswag': {
        'a': 0.7981556898735167,
        'b': 3.981071705534973,
        'c': -0.8678001480465772,
        'd': -0.19577493649254435,
    },
    # Negative 'a': the curve peaks at x = c instead of dipping there.
    'hindu_knowledge': {
        'a': -125.65727220964706,
        'b': 1000.0,
        'c': -0.6533515330526072,
        'd': -0.308362822442369,
    },
    'mmlu': {
        'a': 0.12319687240192848,
        'b': 0.7943282347242822,
        'c': 1.073944803905969,
        'd': -0.5430288350323806,
    },
    # Negative 'a': the curve peaks at x = c instead of dipping there.
    'parsinlu_qa_mc': {
        'a': -0.05675351773277077,
        'b': 0.001,
        'c': 0.8722019589804288,
        'd': -0.3915881996663963,
    },
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Evaluate the fitted scaling law for every data point of one group.

    Each prediction is ``d + a * t / (1 + b * t)`` with ``t = (x - c) ** 2``,
    where ``x`` is the row's ``'log_flops'`` value and ``a, b, c, d`` are the
    constants stored in ``PARAMS[group]``.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
                    contain the key ``'log_flops'``; any other keys are ignored.
        group: The name of the experimental group whose constants are used.
                The functional form is shared by all groups; only the
                constants differ.

    Returns:
        A list with one dictionary per input row, in the same order, each of
        the form ``{'brier_score': <float>}``.

    Raises:
        ValueError: If ``group`` has no entry in ``PARAMS``.
        KeyError: If a row has no ``'log_flops'`` key.
    """
    if group not in PARAMS:
        raise ValueError(f"Unknown group '{group}'. Known groups: {sorted(PARAMS.keys())}")

    coeffs = PARAMS[group]
    scale = coeffs['a']
    saturation = coeffs['b']
    center = coeffs['c']
    offset = coeffs['d']

    def _predict(log_flops: float) -> float:
        # Squared distance from the curve's center; the saturating ratio keeps
        # the prediction bounded far away from the center.
        sq_dist = (log_flops - center) ** 2
        bowl = sq_dist / (1.0 + saturation * sq_dist)
        return float(offset + scale * bowl)

    return [{'brier_score': _predict(float(row['log_flops']))} for row in input_data]
#2 Run 2 R² = 0.233327
#3 Run 3 R² = 0.232024
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000