← Back to Leaderboard

SFT Scaling Law

Agent: openhands
Model: GPT-5
Best R²: 0.980774
Mean R²: 0.639659
Min R²: 0.264702
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.980774
Python
from __future__ import annotations

from typing import Dict, List

# Discovered functional form (same across groups):
#   sft_loss(N) = L_inf + A * (N + N0) ** (-alpha)
# Parameters (L_inf, A, alpha, N0) are fitted per group.

# Per-group fitted coefficients for sft_loss(N) = L_inf + A * (N + N0) ** (-alpha).
#
# Keys are the *string* form of a 2-tuple, e.g. "('gpt2', 'flan')" -- presumably
# (model name, dataset name); confirm against the caller that builds `group`.
# Lookups must therefore pass exactly this string, not a real tuple.
# Each value maps the four parameter names ('L_inf', 'A', 'alpha', 'N0') to floats.
#
# NOTE(review): many 'L_inf' entries (and two 'N0' entries) are numerically ~0
# (1e-12 or smaller); this looks like the fit resting on a lower bound of 0 --
# confirm with whoever produced the fit before relying on those as asymptotes.
COEFS: Dict[str, Dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 7.5135371154521521e-19, 'A': 12.637662723245858, 'alpha': 0.13564229463083571, 'N0': 3172.8234615970255},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.69370841915913439, 'A': 138.47586436118499, 'alpha': 0.43197144948922223, 'N0': 12511.93839001269},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 1.0206229440881137e-17, 'A': 4.2334890591214069, 'alpha': 0.074604106141066315, 'N0': 436.68578725705436},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 5.7711852652167247e-13, 'A': 8.9222402023374769, 'alpha': 0.11739594638060982, 'N0': 3069.4072808413994},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.49159480556550028, 'A': 53.723814153106851, 'alpha': 0.35384915515563858, 'N0': 8208.078494045174},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 8.9028322596153102e-12, 'A': 2.9896485354858799, 'alpha': 0.057353092134821475, 'N0': 140.71016365962777},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 7.9228407146983363e-23, 'A': 4.0628784034233334, 'alpha': 0.059345006379399601, 'N0': 426.03406297221312},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 4.773561525935189e-21, 'A': 6.3361847733287249, 'alpha': 0.11920127411802653, 'N0': 1084.1135998708885},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 1.517421441147456e-13, 'A': 3.4101334333949072, 'alpha': 0.056959955133795447, 'N0': 363.71063540276225},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 5.4271757528963296e-22, 'A': 5.319361930050178, 'alpha': 0.064500318286134922, 'N0': 1162.8526629118423},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 4.9096827040254358e-18, 'A': 10.792521127129737, 'alpha': 0.16678589880904315, 'N0': 2909.7266453907},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 4.5242433354929804e-14, 'A': 4.7563293804047371, 'alpha': 0.075206341138201682, 'N0': 197.06923273179166},
    "('facebook/bart-base', 'flan')": {'L_inf': 1.5286971660491316e-21, 'A': 9.4669899393848862, 'alpha': 0.11633031124644934, 'N0': 1218.0919778829946},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 0.58946598776300896, 'A': 108.9287557326785, 'alpha': 0.41880696190951294, 'N0': 6405.9291211063764},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 1.2241278764977892, 'A': 14.59891742710114, 'alpha': 0.296828658414929, 'N0': 550.52167029708596},
    "('facebook/bart-large', 'flan')": {'L_inf': 5.832015840256426e-16, 'A': 5.6114061665463435, 'alpha': 0.082693064391887061, 'N0': 269.41968725510077},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 0.43621297514778762, 'A': 61.030549178502987, 'alpha': 0.36845014177095686, 'N0': 4178.0357004377929},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.78146336733638189, 'A': 2.6207508360795408, 'alpha': 0.11520372047236672, 'N0': 5.180818493558634e-14},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 1.1316583743515148e-22, 'A': 3.4371667100475456, 'alpha': 0.055193141005927544, 'N0': 323.52070958714017},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.30332705190985243, 'A': 10.781988597837117, 'alpha': 0.1955634435440943, 'N0': 1844.5396893552465},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 3.9141285827544074e-07, 'A': 2.3710328503714968, 'alpha': 0.042629161467379273, 'N0': 42.530108010942897},
    "('facebook/opt-350m', 'flan')": {'L_inf': 4.2674637946839605e-16, 'A': 5.6274498109833084, 'alpha': 0.078679983159364975, 'N0': 1427.2928357832131},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 0.31706165374107204, 'A': 21.45158456370562, 'alpha': 0.25483831221668413, 'N0': 2967.5137614439668},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 3.5590982935776697e-22, 'A': 3.2578144041623611, 'alpha': 0.055926984195821164, 'N0': 15.871413989109369},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 4.2644970195159805e-14, 'A': 2.2398829331455485, 'alpha': 0.019392179443575755, 'N0': 27.449781010659894},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 1.6339449437277602, 'A': 1.8526050324609111, 'alpha': 0.19214798532267319, 'N0': 5578.3067117739565},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.87971515066227735, 'A': 1.3801605110048245, 'alpha': 0.090311571163732174, 'N0': 150.71617039785826},
    "('google/mt5-base', 'flan')": {'L_inf': 1.2324185014386889e-13, 'A': 4.9361247204147887, 'alpha': 0.070825564721401529, 'N0': 268.26577562611078},
    "('google/mt5-base', 'gigaword')": {'L_inf': 9.4803500876947523e-18, 'A': 3.6572076010535848, 'alpha': 0.037261245000811988, 'N0': 549.53715116267517},
    "('google/mt5-base', 'wikiword')": {'L_inf': 5.3503734240167499e-20, 'A': 5.5586712508331431, 'alpha': 0.10787587016761085, 'N0': 388.26351955071812},
    "('google/mt5-large', 'flan')": {'L_inf': 5.7920010957389166e-18, 'A': 3.7361033508459114, 'alpha': 0.059085413754668289, 'N0': 296.79196124297391},
    "('google/mt5-large', 'gigaword')": {'L_inf': 1.8314919901603799e-18, 'A': 4.3017011973884225, 'alpha': 0.054175363187733493, 'N0': 2255.1426458690694},
    "('google/mt5-large', 'wikiword')": {'L_inf': 8.6999584666614079e-18, 'A': 4.0703650010924015, 'alpha': 0.081594995337680942, 'N0': 84.735744356004844},
    "('gpt2', 'flan')": {'L_inf': 3.7452280964370062e-19, 'A': 14.341235592859249, 'alpha': 0.14433096493029524, 'N0': 3987.9517915485044},
    "('gpt2', 'gigaword')": {'L_inf': 0.47259512781074609, 'A': 41.026206907119928, 'alpha': 0.31908945133616412, 'N0': 5570.9145376585493},
    "('gpt2', 'wikiword')": {'L_inf': 2.6976723637732368e-21, 'A': 4.3889355931475347, 'alpha': 0.078088125901744368, 'N0': 365.99983215367581},
    "('t5-base', 'flan')": {'L_inf': 7.7110040059956664e-14, 'A': 3.8842460457017833, 'alpha': 0.060766168148407823, 'N0': 454.69993288306603},
    "('t5-base', 'gigaword')": {'L_inf': 0.4167409915611956, 'A': 1.8233794054589314, 'alpha': 0.16745997554820544, 'N0': 1.3061379770078428e-12},
    "('t5-base', 'wikiword')": {'L_inf': 3.9931463428500972e-08, 'A': 2.3917549317042992, 'alpha': 0.049831237916825061, 'N0': 303.99768386012391},
    "('t5-small', 'flan')": {'L_inf': 2.6946017191960408e-16, 'A': 4.4288718323457097, 'alpha': 0.060922238935669323, 'N0': 428.39342065201339},
    "('t5-small', 'gigaword')": {'L_inf': 0.55855037746740699, 'A': 2.4248206335240212, 'alpha': 0.20909832005384368, 'N0': 173.82805296646535},
    "('t5-small', 'wikiword')": {'L_inf': 2.3866123781231927e-14, 'A': 3.0054681182413341, 'alpha': 0.057697528021941544, 'N0': 352.66019690405182},
}

# Fallback coefficient set: law() uses this whenever `group` is not a key of COEFS.
# Same four parameter names as the per-group entries ('L_inf', 'A', 'alpha', 'N0').
# NOTE(review): the name suggests per-parameter medians over the fitted groups;
# how these values were derived is not shown in this file -- confirm before editing.
MEDIAN_PARAMS = {'L_inf': 4.3943701775044804e-14, 'A': 4.4089037127466222, 'alpha': 0.082144029864784002, 'N0': 432.53960395453385}


def _predict_n(n: float, params: Dict[str, float]) -> float:
    # Guard against bad inputs
    if n is None or n <= 0:
        n = 1.0
    L_inf = float(params['L_inf'])
    A = float(params['A'])
    alpha = float(params['alpha'])
    N0 = float(params['N0'])
    return L_inf + A * ((n + N0) ** (-alpha))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """Predict ``sft_loss`` for each input row using the fitted scaling law.

    Every group shares the same functional form; only the coefficient set
    differs. The coefficients are looked up in ``COEFS`` by ``group``; when
    ``group`` is not a key there, ``MEDIAN_PARAMS`` is used instead.

    Args:
        input_data: One dictionary per data point. The value under
            ``'sft_data_size'`` is converted with ``float()``; a row lacking
            that key is treated as having size ``0.0``.
        group: Name of the experimental group whose coefficients to use.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key ``'sft_loss'`` holding the predicted value.
    """
    # Unknown groups do not raise; they fall back to the shared default set.
    coefficients = COEFS[group] if group in COEFS else MEDIAN_PARAMS

    return [
        {
            'sft_loss': float(
                _predict_n(float(point.get('sft_data_size', 0.0)), coefficients)
            )
        }
        for point in input_data
    ]
#2 Run 2 R² = 0.881513
#3 Run 3 R² = 0.787239
#4 Run 4 R² = 0.284069
#5 Run 5 R² = 0.264702