← Back to Leaderboard

U-shaped Scaling Law

Agent: mini-swe-agent
Model: GPT-5
Best R²: 0.302510
Mean R²: -0.491199
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.302510
Python
# Auto-generated convex (U-shaped) quadratic scaling law: brier_score vs log_flops
from typing import List, Dict

# y = a * (x - c)**2 + b, x = log_flops, y = brier_score
# Fitted parameters per group, stored compactly as (a, b, c) and expanded
# below into {"a": ..., "b": ..., "c": ...} mappings. The "__default__"
# entry comes first, followed by the named groups.
_COEFS: dict[str, dict[str, float]] = {
    name: {"a": a, "b": b, "c": c}
    for name, (a, b, c) in (
        (
            "__default__",
            (0.016435167540703028, -0.42693678124584933, -1.6953621257789337),
        ),
        (
            "abstract_narrative_understanding",
            (0.03559099077642667, -0.644905899063743, -1.6953621257789337),
        ),
        (
            "analogical_similarity",
            (0.00015458082325933974, -0.5443035235943459, -1.6056066401185167),
        ),
        (
            "arc",
            (0.010326836994446163, -0.1361076284330853, -1.6953621257789337),
        ),
        (
            "arithmetic",
            (0.015354018314906261, -0.31591913431197544, -1.5743719559664417),
        ),
        (
            "conceptual_combinations",
            (0.011559745311168344, -0.4646413787056248, -1.4823777596427932),
        ),
        (
            "hellaswag",
            (0.007609182534322093, -0.08846707981157574, -1.6953621257789337),
        ),
        (
            "hindu_knowledge",
            (0.01020143688094949, -0.4433503348958553, 1.2992805614553293),
        ),
        (
            "mmlu",
            (0.01625188241125213, -0.5485520085426114, 2.0141120689193435),
        ),
        (
            "parsinlu_qa_mc",
            (1e-08, -0.4342415825508818, -1.6953621257789337),
        ),
    )
}

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted convex quadratic for every row of ``input_data``.

    Each prediction is ``brier_score = a * (log_flops - c) ** 2 + b``, with
    ``a``, ``b`` and ``c`` taken from the ``_COEFS`` entry for ``group``.
    A group that is not present in ``_COEFS`` uses the ``"__default__"``
    entry instead.

    Args:
        input_data: A list of data points; each one is a dictionary that
                    must provide a ``"log_flops"`` value (a missing key
                    raises ``KeyError``).
        group: The name of the experimental group to predict for. The
                functional form is shared by all groups; only the
                coefficients differ.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input
        row, in the same order as ``input_data``.
    """
    fitted = _COEFS[group] if group in _COEFS else _COEFS["__default__"]
    curvature = float(fitted["a"])  # quadratic coefficient
    floor = float(fitted["b"])  # value of the curve at its vertex
    vertex = float(fitted["c"])  # log_flops at which the curve bottoms out

    return [
        {
            "brier_score": float(
                curvature * (float(point["log_flops"]) - vertex) ** 2 + floor
            )
        }
        for point in input_data
    ]
#2 Run 2 R² = 0.241497
#3 Run 3 R² = -1.000000
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000