← Back to Leaderboard

SFT Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.893342
Mean R²: 0.845450
Min R²: 0.756889
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.893342
Python
from __future__ import annotations
from typing import List, Dict

# Discovered scaling law (shared functional form across groups):
#   sft_loss(N) = L_inf + A * N^(-alpha)
# Coefficients are fitted per experimental group.

# Per-group coefficients for the shared power law L_inf + A * N**(-alpha),
# fitted from /app/data. Keys are stringified (model, dataset) tuples;
# each value supplies the three constants consumed by _predict_one().
PARAMS: Dict[str, Dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 0.0000000000, 'A': 7.4655976596, 'alpha': 0.0893833639},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 6.9696265408, 'alpha': 0.1248879757},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.8453873318, 'alpha': 0.0660024991},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 0.0000000000, 'A': 5.7130408378, 'alpha': 0.0782922582},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 6.2931215297, 'alpha': 0.1291848119},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.8960123299, 'alpha': 0.0544754427},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 0.0000000000, 'A': 3.7633686465, 'alpha': 0.0524564535},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 0.0000000000, 'A': 4.8571354747, 'alpha': 0.0957503004},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.1982974523, 'alpha': 0.0512269552},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 0.0000000000, 'A': 4.5820265379, 'alpha': 0.0513362827},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 5.7873187312, 'alpha': 0.1119673779},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.5087532133, 'alpha': 0.0704142022},
    "('facebook/bart-base', 'flan')": {'L_inf': 0.0000000000, 'A': 7.1611436323, 'alpha': 0.0916473833},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 0.0000000000, 'A': 9.2960910354, 'alpha': 0.1581352715},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 0.2857551447, 'A': 5.8306038938, 'alpha': 0.1217045037},
    "('facebook/bart-large', 'flan')": {'L_inf': 0.0000000000, 'A': 5.2395659867, 'alpha': 0.0767344267},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 0.0000000000, 'A': 9.5069117910, 'alpha': 0.1693705958},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.7850518893, 'A': 2.6271353620, 'alpha': 0.1159319757},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 0.0000000000, 'A': 3.2428955975, 'alpha': 0.0499613896},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.0000000000, 'A': 5.6934577617, 'alpha': 0.1182278832},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.3523591081, 'alpha': 0.0419183827},
    "('facebook/opt-350m', 'flan')": {'L_inf': 0.0000000000, 'A': 4.5858563672, 'alpha': 0.0606207735},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 0.0000000000, 'A': 7.4768918755, 'alpha': 0.1403389747},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.2500719821, 'alpha': 0.0557536570},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 0.0000000000, 'A': 2.2344284065, 'alpha': 0.0191717060},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 0.0000000000, 'A': 2.1808236698, 'alpha': 0.0146927813},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.2715471457, 'A': 1.7859035415, 'alpha': 0.0422749399},
    "('google/mt5-base', 'flan')": {'L_inf': 0.0000000000, 'A': 4.6211698165, 'alpha': 0.0648699072},
    "('google/mt5-base', 'gigaword')": {'L_inf': 0.0000000000, 'A': 3.4542367430, 'alpha': 0.0321327577},
    "('google/mt5-base', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.8802698557, 'alpha': 0.0961777019},
    "('google/mt5-large', 'flan')": {'L_inf': 0.0000000000, 'A': 3.5193429654, 'alpha': 0.0536772855},
    "('google/mt5-large', 'gigaword')": {'L_inf': 0.0000000000, 'A': 3.5889984642, 'alpha': 0.0381545424},
    "('google/mt5-large', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.9537507864, 'alpha': 0.0789696371},
    "('gpt2', 'flan')": {'L_inf': 0.0000000000, 'A': 7.7432486686, 'alpha': 0.0903489876},
    "('gpt2', 'gigaword')": {'L_inf': 0.0000000000, 'A': 7.4481233493, 'alpha': 0.1368351252},
    "('gpt2', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.0053815966, 'alpha': 0.0698352636},
    "('t5-base', 'flan')": {'L_inf': 0.0000000000, 'A': 3.5927162124, 'alpha': 0.0538530120},
    "('t5-base', 'gigaword')": {'L_inf': 0.4187926280, 'A': 1.8293722612, 'alpha': 0.1684931683},
    "('t5-base', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.2734524248, 'alpha': 0.0452549072},
    "('t5-small', 'flan')": {'L_inf': 0.0000000000, 'A': 4.1031621457, 'alpha': 0.0541145502},
    "('t5-small', 'gigaword')": {'L_inf': 0.4134344504, 'A': 1.7782418386, 'alpha': 0.1373669844},
    "('t5-small', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.8159260594, 'alpha': 0.0518360448},
}

# Fallback parameters used when an unseen group is requested.
# NOTE(review): described upstream as "medians over fitted groups", but the
# values look hand-rounded (A=4.0, alpha=0.07) rather than exact medians —
# treat them as rough central estimates, not computed statistics.
_FALLBACK = {'L_inf': 0.0, 'A': 4.0, 'alpha': 0.07}


def _predict_one(n: float, params: Dict[str, float]) -> float:
    # Guard against non-positive sizes
    n = max(float(n), 1.0)
    L_inf = params['L_inf']
    A = params['A']
    alpha = params['alpha']
    return L_inf + A * (n ** (-alpha))


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).

    Raises:
        KeyError: If any input row lacks the 'sft_data_size' key.
    """
    # Unseen groups fall back to rough central coefficients.
    coeffs = PARAMS.get(group, _FALLBACK)
    predictions: List[Dict[str, float]] = []
    for point in input_data:
        if 'sft_data_size' not in point:
            raise KeyError("Missing required input key 'sft_data_size'")
        loss = _predict_one(point['sft_data_size'], coeffs)
        predictions.append({'sft_loss': float(loss)})
    return predictions
#2 Run 2 R² = 0.887104
#3 Run 3 R² = 0.866421
#4 Run 4 R² = 0.823497
#5 Run 5 R² = 0.756889