← Back to Leaderboard

U-shaped Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.302510
Mean R²: -0.479946
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.302510
Python
from __future__ import annotations
from typing import List, Dict

# Discovered U-shaped scaling law parameters per group
# Model: brier_score = a * (log_flops - c)**2 + b
_PARAMS: Dict[str, Dict[str, float]] = {
    'mmlu': {'a': 0.01625188241125213, 'b': -0.5485520085426114, 'c': 2.0141120689193435},
    'parsinlu_qa_mc': {'a': 0.0, 'b': -0.4342414968542909, 'c': 1.1106711713084738},
    'arithmetic': {'a': 0.015354018314906261, 'b': -0.31591913431197544, 'c': -1.5743719559664417},
    'hindu_knowledge': {'a': 0.01020143688094949, 'b': -0.4433503348958553, 'c': 1.2992805614553293},
    'analogical_similarity': {'a': 0.00015458082325933974, 'b': -0.5443035235943459, 'c': -1.6056066401185167},
    'conceptual_combinations': {'a': 0.011559745311168344, 'b': -0.4646413787056248, 'c': -1.4823777596427932},
    'hellaswag': {'a': 0.007609182534322093, 'b': -0.08846707981157574, 'c': -1.6953621257789337},
    'arc': {'a': 0.010326836994446163, 'b': -0.1361076284330853, 'c': -1.6953621257789337},
    'abstract_narrative_understanding': {'a': 0.03559099077642667, 'b': -0.644905899063743, 'c': -1.6953621257789337},
    # Fallback if an unseen group is requested
    '__default__': {'a': 0.016435167540703028, 'b': -0.42693678124584933, 'c': -1.6953621257789337},
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _PARAMS.get(group, _PARAMS['__default__'])
    a = float(params['a'])
    b = float(params['b'])
    c = float(params['c'])

    out: List[Dict[str, float]] = []
    for row in input_data:
        if 'log_flops' not in row:
            raise KeyError("Each input data point must contain 'log_flops'.")
        x = float(row['log_flops'])
        y = a * (x - c) ** 2 + b
        out.append({'brier_score': float(y)})
    return out
#2 Run 2 R² = 0.297759
#3 Run 3 R² = -1.000000
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000