SLD - SFT Scaling Law - openhands + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.980774

▼

Python

from __future__ import annotations

from typing import Dict, List

# Discovered functional form (same across groups):
#   sft_loss(N) = L_inf + A * (N + N0) ** (-alpha)
# Parameters (L_inf, A, alpha, N0) are fitted per group.

COEFS: Dict[str, Dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 7.5135371154521521e-19, 'A': 12.637662723245858, 'alpha': 0.13564229463083571, 'N0': 3172.8234615970255},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.69370841915913439, 'A': 138.47586436118499, 'alpha': 0.43197144948922223, 'N0': 12511.93839001269},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 1.0206229440881137e-17, 'A': 4.2334890591214069, 'alpha': 0.074604106141066315, 'N0': 436.68578725705436},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 5.7711852652167247e-13, 'A': 8.9222402023374769, 'alpha': 0.11739594638060982, 'N0': 3069.4072808413994},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.49159480556550028, 'A': 53.723814153106851, 'alpha': 0.35384915515563858, 'N0': 8208.078494045174},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 8.9028322596153102e-12, 'A': 2.9896485354858799, 'alpha': 0.057353092134821475, 'N0': 140.71016365962777},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 7.9228407146983363e-23, 'A': 4.0628784034233334, 'alpha': 0.059345006379399601, 'N0': 426.03406297221312},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 4.773561525935189e-21, 'A': 6.3361847733287249, 'alpha': 0.11920127411802653, 'N0': 1084.1135998708885},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 1.517421441147456e-13, 'A': 3.4101334333949072, 'alpha': 0.056959955133795447, 'N0': 363.71063540276225},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 5.4271757528963296e-22, 'A': 5.319361930050178, 'alpha': 0.064500318286134922, 'N0': 1162.8526629118423},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 4.9096827040254358e-18, 'A': 10.792521127129737, 'alpha': 0.16678589880904315, 'N0': 2909.7266453907},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 4.5242433354929804e-14, 'A': 4.7563293804047371, 'alpha': 0.075206341138201682, 'N0': 197.06923273179166},
    "('facebook/bart-base', 'flan')": {'L_inf': 1.5286971660491316e-21, 'A': 9.4669899393848862, 'alpha': 0.11633031124644934, 'N0': 1218.0919778829946},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 0.58946598776300896, 'A': 108.9287557326785, 'alpha': 0.41880696190951294, 'N0': 6405.9291211063764},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 1.2241278764977892, 'A': 14.59891742710114, 'alpha': 0.296828658414929, 'N0': 550.52167029708596},
    "('facebook/bart-large', 'flan')": {'L_inf': 5.832015840256426e-16, 'A': 5.6114061665463435, 'alpha': 0.082693064391887061, 'N0': 269.41968725510077},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 0.43621297514778762, 'A': 61.030549178502987, 'alpha': 0.36845014177095686, 'N0': 4178.0357004377929},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.78146336733638189, 'A': 2.6207508360795408, 'alpha': 0.11520372047236672, 'N0': 5.180818493558634e-14},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 1.1316583743515148e-22, 'A': 3.4371667100475456, 'alpha': 0.055193141005927544, 'N0': 323.52070958714017},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.30332705190985243, 'A': 10.781988597837117, 'alpha': 0.1955634435440943, 'N0': 1844.5396893552465},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 3.9141285827544074e-07, 'A': 2.3710328503714968, 'alpha': 0.042629161467379273, 'N0': 42.530108010942897},
    "('facebook/opt-350m', 'flan')": {'L_inf': 4.2674637946839605e-16, 'A': 5.6274498109833084, 'alpha': 0.078679983159364975, 'N0': 1427.2928357832131},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 0.31706165374107204, 'A': 21.45158456370562, 'alpha': 0.25483831221668413, 'N0': 2967.5137614439668},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 3.5590982935776697e-22, 'A': 3.2578144041623611, 'alpha': 0.055926984195821164, 'N0': 15.871413989109369},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 4.2644970195159805e-14, 'A': 2.2398829331455485, 'alpha': 0.019392179443575755, 'N0': 27.449781010659894},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 1.6339449437277602, 'A': 1.8526050324609111, 'alpha': 0.19214798532267319, 'N0': 5578.3067117739565},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.87971515066227735, 'A': 1.3801605110048245, 'alpha': 0.090311571163732174, 'N0': 150.71617039785826},
    "('google/mt5-base', 'flan')": {'L_inf': 1.2324185014386889e-13, 'A': 4.9361247204147887, 'alpha': 0.070825564721401529, 'N0': 268.26577562611078},
    "('google/mt5-base', 'gigaword')": {'L_inf': 9.4803500876947523e-18, 'A': 3.6572076010535848, 'alpha': 0.037261245000811988, 'N0': 549.53715116267517},
    "('google/mt5-base', 'wikiword')": {'L_inf': 5.3503734240167499e-20, 'A': 5.5586712508331431, 'alpha': 0.10787587016761085, 'N0': 388.26351955071812},
    "('google/mt5-large', 'flan')": {'L_inf': 5.7920010957389166e-18, 'A': 3.7361033508459114, 'alpha': 0.059085413754668289, 'N0': 296.79196124297391},
    "('google/mt5-large', 'gigaword')": {'L_inf': 1.8314919901603799e-18, 'A': 4.3017011973884225, 'alpha': 0.054175363187733493, 'N0': 2255.1426458690694},
    "('google/mt5-large', 'wikiword')": {'L_inf': 8.6999584666614079e-18, 'A': 4.0703650010924015, 'alpha': 0.081594995337680942, 'N0': 84.735744356004844},
    "('gpt2', 'flan')": {'L_inf': 3.7452280964370062e-19, 'A': 14.341235592859249, 'alpha': 0.14433096493029524, 'N0': 3987.9517915485044},
    "('gpt2', 'gigaword')": {'L_inf': 0.47259512781074609, 'A': 41.026206907119928, 'alpha': 0.31908945133616412, 'N0': 5570.9145376585493},
    "('gpt2', 'wikiword')": {'L_inf': 2.6976723637732368e-21, 'A': 4.3889355931475347, 'alpha': 0.078088125901744368, 'N0': 365.99983215367581},
    "('t5-base', 'flan')": {'L_inf': 7.7110040059956664e-14, 'A': 3.8842460457017833, 'alpha': 0.060766168148407823, 'N0': 454.69993288306603},
    "('t5-base', 'gigaword')": {'L_inf': 0.4167409915611956, 'A': 1.8233794054589314, 'alpha': 0.16745997554820544, 'N0': 1.3061379770078428e-12},
    "('t5-base', 'wikiword')": {'L_inf': 3.9931463428500972e-08, 'A': 2.3917549317042992, 'alpha': 0.049831237916825061, 'N0': 303.99768386012391},
    "('t5-small', 'flan')": {'L_inf': 2.6946017191960408e-16, 'A': 4.4288718323457097, 'alpha': 0.060922238935669323, 'N0': 428.39342065201339},
    "('t5-small', 'gigaword')": {'L_inf': 0.55855037746740699, 'A': 2.4248206335240212, 'alpha': 0.20909832005384368, 'N0': 173.82805296646535},
    "('t5-small', 'wikiword')": {'L_inf': 2.3866123781231927e-14, 'A': 3.0054681182413341, 'alpha': 0.057697528021941544, 'N0': 352.66019690405182},
}

# Fallback coefficients: law() uses these when `group` is not a key of COEFS.
# NOTE(review): presumably the per-parameter medians over the fitted groups —
# confirm against the fitting script.
MEDIAN_PARAMS = {'L_inf': 4.3943701775044804e-14, 'A': 4.4089037127466222, 'alpha': 0.082144029864784002, 'N0': 432.53960395453385}


def _predict_n(n: float, params: Dict[str, float]) -> float:
    # Guard against bad inputs
    if n is None or n <= 0:
        n = 1.0
    L_inf = float(params['L_inf'])
    A = float(params['A'])
    alpha = float(params['alpha'])
    N0 = float(params['N0'])
    return L_inf + A * ((n + N0) ** (-alpha))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the SFT loss for every row using the per-group shifted power law.

    Args:
        input_data: A list of dictionaries, one per data point. The value under
                    'sft_data_size' is used as the data size; a missing key is
                    read as 0.0.
        group: The name of the experimental group. Its coefficients are looked
                up in COEFS; an unknown group uses MEDIAN_PARAMS instead. The
                functional form is the same for every group, only the
                coefficients differ.

    Returns:
        A list of dictionaries, in the same order as input_data, each holding
        the prediction under the key 'sft_loss'.
    """
    coefficients = COEFS.get(group, MEDIAN_PARAMS)
    return [
        {'sft_loss': float(_predict_n(float(point.get('sft_data_size', 0.0)), coefficients))}
        for point in input_data
    ]

#2 Run 2 R² = 0.881513

▼

Python

from typing import List, Dict
import math

# Discovered scaling law (shared form across groups):
#   sft_loss(N) = L_inf + A * N**(-alpha)
# Parameters (L_inf, A, alpha) are fitted per experimental group.
# Keys match the dataset's string representation of the group.
_PARAMS: Dict[str, Dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {"L_inf": 0.47411344881239503, "A": 7.374808337249902, "alpha": 0.10559131746215066},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {"L_inf": -0.27165359489412566, "A": 6.860897755882119, "alpha": 0.11000429377966452},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {"L_inf": 0.7548157825020428, "A": 3.4707438590784974, "alpha": 0.10469167102305689},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {"L_inf": 0.5986396543309487, "A": 5.504238879900254, "alpha": 0.10138910303291937},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {"L_inf": -0.26571541893347317, "A": 6.163958957295546, "alpha": 0.11217152849430964},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {"L_inf": 0.8463866229511964, "A": 2.3861909601450515, "alpha": 0.10675039464871704},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {"L_inf": 1.1532050628799038, "A": 3.0593391008291815, "alpha": 0.1063286666070229},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {"L_inf": 0.13731630548678075, "A": 4.832333492547224, "alpha": 0.10312003781282773},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {"L_inf": 0.9903338078822375, "A": 2.5737102631364244, "alpha": 0.10382388972999458},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {"L_inf": 1.2655018787222039, "A": 3.7596472555051346, "alpha": 0.09479040630151708},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {"L_inf": -0.09998631088564425, "A": 5.777782392650072, "alpha": 0.10645628309837579},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {"L_inf": 0.7785420197141477, "A": 4.152352522191937, "alpha": 0.10618673866174984},
    "('facebook/bart-base', 'flan')": {"L_inf": 0.36459417457512533, "A": 7.085796493184775, "alpha": 0.10462298211809325},
    "('facebook/bart-base', 'gigaword')": {"L_inf": -0.7686886755625248, "A": 8.529332274933846, "alpha": 0.11414571125661148},
    "('facebook/bart-base', 'wikiword')": {"L_inf": 0.28736890285933836, "A": 5.831010953341128, "alpha": 0.12180825697034668},
    "('facebook/bart-large', 'flan')": {"L_inf": 0.6377771788560049, "A": 4.983169060709315, "alpha": 0.1029142315676206},
    "('facebook/bart-large', 'gigaword')": {"L_inf": -0.8480423693355799, "A": 8.450682070471329, "alpha": 0.11613923216094764},
    "('facebook/bart-large', 'wikiword')": {"L_inf": 0.7738835957171751, "A": 2.6258922117119616, "alpha": 0.11449526082979924},
    "('facebook/opt-1.3b', 'flan')": {"L_inf": 1.0516216885374654, "A": 2.569460860025876, "alpha": 0.10478186744092736},
    "('facebook/opt-1.3b', 'gigaword')": {"L_inf": -0.1170624514587777, "A": 5.670422892511086, "alpha": 0.11109463193971202},
    "('facebook/opt-1.3b', 'wikiword')": {"L_inf": 0.9604445677062592, "A": 1.670448216347929, "alpha": 0.1065558735650616},
    "('facebook/opt-350m', 'flan')": {"L_inf": 0.9725591969482548, "A": 4.048032141370493, "alpha": 0.09858854088474828},
    "('facebook/opt-350m', 'gigaword')": {"L_inf": -0.48081699720904036, "A": 7.193524925842425, "alpha": 0.11226593209674694},
    "('facebook/opt-350m', 'wikiword')": {"L_inf": 0.8957143243445762, "A": 2.7053104006048057, "alpha": 0.10442868141102707},
    "('facebook/opt-6.7b', 'flan')": {"L_inf": 1.5178695146140102, "A": 0.8963908163255226, "alpha": 0.10330062084991393},
    "('facebook/opt-6.7b', 'gigaword')": {"L_inf": 1.6271899761579776, "A": 0.7228639238613288, "alpha": 0.10714822966510242},
    "('facebook/opt-6.7b', 'wikiword')": {"L_inf": 0.9976291700548863, "A": 1.2701553495194946, "alpha": 0.10692842591735088},
    "('google/mt5-base', 'flan')": {"L_inf": 0.9909552392011973, "A": 4.127108041394403, "alpha": 0.10765554735842911},
    "('google/mt5-base', 'gigaword')": {"L_inf": 1.738144527000284, "A": 2.0894910201254104, "alpha": 0.10236881852083911},
    "('google/mt5-base', 'wikiword')": {"L_inf": 0.219940755393925, "A": 4.844151110122103, "alpha": 0.10835078116396765},
    "('google/mt5-large', 'flan')": {"L_inf": 1.0582629427618657, "A": 2.88488578563406, "alpha": 0.10765599678024225},
    "('google/mt5-large', 'gigaword')": {"L_inf": 1.5590598696645523, "A": 2.4667672986940827, "alpha": 0.10397451979276673},
    "('google/mt5-large', 'wikiword')": {"L_inf": 0.4997614924770106, "A": 3.764600236381742, "alpha": 0.10752696949469541},
    "('gpt2', 'flan')": {"L_inf": 0.438056337367148, "A": 7.6634206051294, "alpha": 0.10485804635532844},
    "('gpt2', 'gigaword')": {"L_inf": -0.42946933611862526, "A": 7.19942996960907, "alpha": 0.11212218230377852},
    "('gpt2', 'wikiword')": {"L_inf": 0.721914712650132, "A": 3.6805767946541, "alpha": 0.10751695465396897},
    "('t5-base', 'flan')": {"L_inf": 0.9738756683322746, "A": 2.9858467916178224, "alpha": 0.09915764181480334},
    "('t5-base', 'gigaword')": {"L_inf": 0.4160154272945322, "A": 1.8226518092580497, "alpha": 0.16725452198815022},
    "('t5-base', 'wikiword')": {"L_inf": 0.8384227713655806, "A": 1.7031116858599484, "alpha": 0.10486402319956421},
    "('t5-small', 'flan')": {"L_inf": 1.1429779868192445, "A": 3.4039902908755177, "alpha": 0.10198959701342969},
    "('t5-small', 'gigaword')": {"L_inf": 0.40646821534634897, "A": 1.772838135879755, "alpha": 0.1354598347974606},
    "('t5-small', 'wikiword')": {"L_inf": 0.8687266845909188, "A": 2.2762678472908346, "alpha": 0.10517519218258724},
}

# Default coefficients returned by _get_params() when the requested group is
# not found in _PARAMS (median-ish values across groups).
_DEFAULT = {"L_inf": 0.9, "A": 3.5, "alpha": 0.106}


def _get_params(group: str) -> Dict[str, float]:
    """Return the coefficient dict for ``group``.

    The name is looked up verbatim first. On a miss, every double quote in
    the name is replaced by a single quote and the lookup is retried; if that
    also misses, _DEFAULT is returned.
    """
    try:
        return _PARAMS[group]
    except KeyError:
        pass
    single_quoted = group.replace('"', "'")
    return _PARAMS.get(single_quoted, _DEFAULT)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the SFT loss for every row with sft_loss(N) = L_inf + A * N**(-alpha).

    Args:
        input_data: A list of dictionaries, one per data point. The value under
                    "sft_data_size" is used as N; a missing key is read as 0.0.
        group: The name of the experimental group; its (L_inf, A, alpha) are
                resolved through _get_params. The functional form is the same
                for every group, only the coefficients differ.

    Returns:
        A list of dictionaries, in the same order as input_data, each holding
        the prediction under the key "sft_loss". For a non-positive N the
        prediction is L_inf itself.
    """
    coeffs = _get_params(group)
    floor = coeffs["L_inf"]
    scale = coeffs["A"]
    exponent = coeffs["alpha"]

    def _evaluate(size: float) -> float:
        # Non-positive sizes skip the power term and yield the floor value.
        return float(floor) if size <= 0 else floor + scale * (size ** (-exponent))

    return [
        {"sft_loss": float(_evaluate(float(point.get("sft_data_size", 0.0))))}
        for point in input_data
    ]

#3 Run 3 R² = 0.787239

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The discovered scaling law for the final SFT loss as a function of the number of
    fine-tuning examples N is:
        sft_loss(N) = L_inf + A * N^(-alpha)

    The functional form is shared across groups, while (L_inf, A, alpha) depend on the
    experimental group.

    Args:
        input_data: Rows to evaluate. Each row's 'sft_data_size' is used as N; a
            missing or non-positive value is treated as N = 1.
        group: Group key such as "('gpt2', 'flan')". A double-quoted spelling, or a
            non-string value such as a (model, task) tuple, is normalised to that
            form before lookup. Unknown groups fall back to the per-task median
            when the task name can be parsed from the key, otherwise to the
            overall median.

    Returns:
        One {'sft_loss': prediction} dict per input row, in input order.
    """
    # Per-group parameters fitted from the provided dataset
    params = {
        "('MBZUAI/LaMini-GPT-124M', 'flan')": {"L_inf": 5.559881562578104e-23, "A": 6.973887795673848, "alpha": 0.08134157828268436},
        "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {"L_inf": 4.327079500922691e-14, "A": 5.923748173995491, "alpha": 0.10520870609722754},
        "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {"L_inf": 1.012696544433358e-20, "A": 3.7815959353426396, "alpha": 0.06407311263806709},
        "('MBZUAI/LaMini-GPT-774M', 'flan')": {"L_inf": 8.817175226395397e-17, "A": 5.416194246903303, "alpha": 0.07203581857213417},
        "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {"L_inf": 2.1212202956484596e-15, "A": 5.3506885622596645, "alpha": 0.10952375080346688},
        "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {"L_inf": 1.7498503534003776e-14, "A": 2.8815067645852346, "alpha": 0.05390255280816395},
        "('cerebras/Cerebras-GPT-1.3B', 'flan')": {"L_inf": 4.4213376653851305e-14, "A": 3.7266874983524167, "alpha": 0.051336229389850475},
        "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {"L_inf": 1.0553780091788161e-17, "A": 4.601802147881311, "alpha": 0.0893794232774253},
        "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {"L_inf": 2.5651860391815964e-23, "A": 3.167052213096539, "alpha": 0.050104822840154946},
        "('cerebras/Cerebras-GPT-256M', 'flan')": {"L_inf": 1.0430715861904326e-11, "A": 4.497775910160822, "alpha": 0.04919730153376939},
        "('cerebras/Cerebras-GPT-256M', 'gigaword')": {"L_inf": 8.299683396004994e-20, "A": 5.2374267924570335, "alpha": 0.10004122927271739},
        "('cerebras/Cerebras-GPT-256M', 'wikiword')": {"L_inf": 5.5178220303264e-15, "A": 4.456019626758435, "alpha": 0.06905741523740135},
        "('facebook/bart-base', 'flan')": {"L_inf": 9.967688172321561e-24, "A": 6.79659479623705, "alpha": 0.08550170313618997},
        "('facebook/bart-base', 'gigaword')": {"L_inf": 2.1582514825954972e-14, "A": 7.594470247698282, "alpha": 0.1333098737650711},
        "('facebook/bart-base', 'wikiword')": {"L_inf": 0.2505082588152107, "A": 5.8619605021880545, "alpha": 0.12018517344350915},
        "('facebook/bart-large', 'flan')": {"L_inf": 1.7722344623948618e-21, "A": 5.1213143331737045, "alpha": 0.07407829953498384},
        "('facebook/bart-large', 'gigaword')": {"L_inf": 2.189571119296465e-16, "A": 7.68391016722078, "alpha": 0.14310642748853092},
        "('facebook/bart-large', 'wikiword')": {"L_inf": 0.7814641269275161, "A": 2.620750815857531, "alpha": 0.11520381374808832},
        "('facebook/opt-1.3b', 'flan')": {"L_inf": 4.412424863407121e-19, "A": 3.217103535577476, "alpha": 0.0490507140893502},
        "('facebook/opt-1.3b', 'gigaword')": {"L_inf": 2.946713648801423e-16, "A": 5.229994536997381, "alpha": 0.10805297833097156},
        "('facebook/opt-1.3b', 'wikiword')": {"L_inf": 4.931261534682388e-14, "A": 2.3491291177019287, "alpha": 0.04176287734274415},
        "('facebook/opt-350m', 'flan')": {"L_inf": 6.169762118090173e-17, "A": 4.468174332740433, "alpha": 0.05761166165573999},
        "('facebook/opt-350m', 'gigaword')": {"L_inf": 8.658086070458824e-16, "A": 6.499200759590741, "alpha": 0.12331846053711241},
        "('facebook/opt-350m', 'wikiword')": {"L_inf": 7.720579158922494e-13, "A": 3.2418388182870084, "alpha": 0.05546399769034253},
        "('facebook/opt-6.7b', 'flan')": {"L_inf": 7.293554654846449e-05, "A": 2.2338960147401603, "alpha": 0.019149109674392775},
        "('facebook/opt-6.7b', 'gigaword')": {"L_inf": 1.565293992962462e-13, "A": 2.1770799496039057, "alpha": 0.0144985269914185},
        "('facebook/opt-6.7b', 'wikiword')": {"L_inf": 0.2693295870152577, "A": 1.7881043289159282, "alpha": 0.04220832008572985},
        "('google/mt5-base', 'flan')": {"L_inf": 1.2519378570990074e-20, "A": 4.571149009162612, "alpha": 0.0636171898266009},
        "('google/mt5-base', 'gigaword')": {"L_inf": 6.12442057141287e-13, "A": 3.4396037441224725, "alpha": 0.031650622094009276},
        "('google/mt5-base', 'wikiword')": {"L_inf": 1.3848462896415916e-18, "A": 4.734453356289708, "alpha": 0.0926126263841603},
        "('google/mt5-large', 'flan')": {"L_inf": 7.833658894818595e-23, "A": 3.4927985709331155, "alpha": 0.05281181461038294},
        "('google/mt5-large', 'gigaword')": {"L_inf": 2.9078937035736547e-12, "A": 3.5492578609621224, "alpha": 0.03688126685016739},
        "('google/mt5-large', 'wikiword')": {"L_inf": 1.4601392172786045e-20, "A": 3.926297381838874, "alpha": 0.07816268962859528},
        "('gpt2', 'flan')": {"L_inf": 6.881729106134914e-20, "A": 7.198398075176432, "alpha": 0.08172044839859696},
        "('gpt2', 'gigaword')": {"L_inf": 2.7587330526782323e-12, "A": 6.339063440387514, "alpha": 0.11725622939708774},
        "('gpt2', 'wikiword')": {"L_inf": 4.910092417326488e-22, "A": 3.946267217667862, "alpha": 0.06811657158668484},
        "('t5-base', 'flan')": {"L_inf": 1.9384460869442965e-21, "A": 3.543052164952107, "alpha": 0.05225511788632308},
        "('t5-base', 'gigaword')": {"L_inf": 0.41674098826209743, "A": 1.8233793979734063, "alpha": 0.1674599741068092},
        "('t5-base', 'wikiword')": {"L_inf": 1.9399139733409873e-15, "A": 2.2598720912644974, "alpha": 0.044573869804594164},
        "('t5-small', 'flan')": {"L_inf": 7.821952737934886e-19, "A": 4.052493003053307, "alpha": 0.05269103060656371},
        "('t5-small', 'gigaword')": {"L_inf": 0.40091826346029086, "A": 1.7757420657725895, "alpha": 0.13439808776804488},
        "('t5-small', 'wikiword')": {"L_inf": 2.8846765809929266e-19, "A": 2.7905737209445953, "alpha": 0.05079953105765396},
    }

    # Fallbacks (if an unseen group appears): medians by task and overall
    per_task_median = {
        'flan': {"L_inf": 2.550298887010306e-19, "A": 4.482975121450627, "alpha": 0.05521173813306146},
        'gigaword': {"L_inf": 3.242665491759094e-14, "A": 5.233710664727207, "alpha": 0.10878836456721921},
        'wikiword': {"L_inf": 3.7288680018336935e-15, "A": 3.2044455156917735, "alpha": 0.059768555164204804},
    }
    overall_median = {"L_inf": 5.802399859630123e-16, "A": 3.9993801103605846, "alpha": 0.06858699341204309}

    # Resolve coefficients for the requested group.
    # str() turns a (model, task) tuple into the same "('model', 'task')" text
    # the table is keyed by, so no exception handling is needed below.
    key = group if isinstance(group, str) else str(group)
    if key not in params:
        # Accept the double-quoted spelling of the same key, e.g. ("gpt2", "flan").
        key = key.replace('"', "'")
    if key in params:
        p = params[key]
    else:
        # Parse the task name from a string like "('model', 'task')":
        # take the last comma-separated segment, then strip ')' / spaces and quotes.
        task = key.split(',')[-1].strip().strip(') ').strip("'")
        p = per_task_median.get(task, overall_median)

    L_inf = float(p["L_inf"])  # asymptotic loss floor
    A = float(p["A"])          # amplitude
    alpha = float(p["alpha"])  # scaling exponent

    out: list[dict[str, float]] = []
    for item in input_data:
        N = float(item.get('sft_data_size', 0.0))
        if N <= 0:
            # Graceful fallback: use minimal positive value to avoid division by zero
            N = 1.0
        out.append({'sft_loss': float(L_inf + A * (N ** (-alpha)))})
    return out

#4 Run 4 R² = 0.284069

▼

Python

from __future__ import annotations
from typing import List, Dict

# Parameters fitted per experimental group for the scaling law
# L(N) = L_inf + A * (N + N0)^(-alpha)
# Each value is the tuple (L_inf, A, alpha, N0).
GROUP_PARAMS = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": (1.74334372845e-14, 12.6376966889, 0.135642527036, 3172.84813899),
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": (0.693703651144, 138.472285153, 0.431968782432, 12511.858243),
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": (1.01586399934e-16, 4.2334921402, 0.0746041722245, 436.68969118),
    "('MBZUAI/LaMini-GPT-774M', 'flan')": (1.19409314659e-18, 8.92224180358, 0.117395961813, 3069.40919683),
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": (0.491595039277, 53.7238611705, 0.353849253703, 8208.08072935),
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": (5.27990777079e-16, 2.98964855216, 0.0573530926488, 140.710189355),
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": (3.76889628326e-16, 4.06287843965, 0.0593450071786, 426.034125852),
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": (6.18853897564e-19, 6.33621255116, 0.119201667319, 1084.13661129),
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": (8.67298183717e-15, 3.41013139136, 0.0569598662977, 363.706821923),
    "('cerebras/Cerebras-GPT-256M', 'flan')": (5.61099380572e-23, 4.09129228086, 0.0655956807586, 381.324214495),
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": (2.33071354039e-21, 6.20496986049, 0.0898142124765, 421.971295556),
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": (7.17491357681e-12, 2.97543402284, 0.10948596302, 1540.59235749),
    "('facebook/bart-base', 'flan')": (4.59171206171e-23, 3.92826799844, 0.0572921215978, 513.889289197),
    "('facebook/bart-base', 'gigaword')": (0.331145263898, 1.80249328365, 0.161681097935, 1.38996740228e-08),
    "('facebook/bart-base', 'wikiword')": (7.54240118853e-08, 2.40134088564, 0.046208695099, 368.565361701),
    "('facebook/bart-large', 'flan')": (5.08695784056e-19, 3.85220096633, 0.053905137389, 522.232192014),
    "('facebook/bart-large', 'gigaword')": (0.333615694269, 2.10767061513, 0.152700192794, 2.67294535273e-08),
    "('facebook/bart-large', 'wikiword')": (0.0102249525968, 2.66899639933, 0.0733424237895, 125.118536989),
    "('google/flan-t5-base', 'flan')": (4.91247757583e-23, 3.60117304024, 0.0572465810795, 417.875313569),
    "('google/flan-t5-base', 'gigaword')": (0.405005274224, 1.61373265073, 0.164013157084, 3.1548076111e-09),
    "('google/flan-t5-base', 'wikiword')": (9.15667282415e-06, 2.32680485136, 0.0503996647358, 283.080905921),
    "('google/flan-t5-small', 'flan')": (1.00714948887e-18, 3.83772842519, 0.0591848781206, 455.282652859),
    "('google/flan-t5-small', 'gigaword')": (0.535288121688, 2.08203051743, 0.178595557119, 22.4200612776),
    "('google/flan-t5-small', 'wikiword')": (1.03689287158e-14, 2.85083455163, 0.056333131327, 301.738465789),
    "('google/gemma-1.1-2b-it', 'flan')": (6.8915143895e-21, 3.00848271538, 0.0480777398665, 390.795415165),
    "('google/gemma-1.1-2b-it', 'gigaword')": (0.625782070821, 1.82234052821, 0.175295164625, 1.42379766592e-08),
    "('google/gemma-1.1-2b-it', 'wikiword')": (2.05488645576e-08, 2.04820492761, 0.0343357892473, 304.231733022),
    "('HuggingFaceH4/zephyr-7b-alpha', 'flan')": (4.9049381785e-22, 2.43999566801, 0.0515771748292, 215.32550852),
    "('HuggingFaceH4/zephyr-7b-alpha', 'gigaword')": (0.663107576466, 2.05155751097, 0.187721428568, 26.8042322803),
    "('HuggingFaceH4/zephyr-7b-alpha', 'wikiword')": (4.12506384686e-15, 1.75811857156, 0.04720378107, 278.035272188),
    "('HuggingFaceH4/zephyr-7b-beta', 'flan')": (3.09249230834e-20, 1.99774222836, 0.0462512339271, 212.068394081),
    "('HuggingFaceH4/zephyr-7b-beta', 'gigaword')": (0.645108253463, 1.31823579619, 0.185613813767, 1.64484516925e-07),
    "('HuggingFaceH4/zephyr-7b-beta', 'wikiword')": (3.41599139834e-18, 1.50808886401, 0.0431953430781, 270.828819886),
    "('MBZUAI/LaMini-GPT-124M', 'flan+synthetic')": (5.48508537431e-09, 3.44038957741, 0.10143885973, 522.878784361),
    "('MBZUAI/LaMini-GPT-124M', 'gigaword+synthetic')": (0.6503504776, 43.3219835398, 0.388166622966, 6800.69906577),
    "('MBZUAI/LaMini-GPT-124M', 'wikiword+synthetic')": (7.2807427126e-09, 2.37643665333, 0.0904549693201, 583.1756373),
    "('MBZUAI/LaMini-GPT-774M', 'flan+synthetic')": (3.14075461673e-10, 3.34192480141, 0.0956885199672, 644.350510751),
    "('MBZUAI/LaMini-GPT-774M', 'gigaword+synthetic')": (0.487120603132, 25.5290146884, 0.295662400974, 4101.34778253),
    "('MBZUAI/LaMini-GPT-774M', 'wikiword+synthetic')": (1.61549400971e-08, 2.19369617312, 0.0864794164689, 589.042812831),
    "('meta-llama/Llama-2-7b-chat-hf', 'flan')": (3.82222214223e-10, 2.17654619575, 0.0736122258328, 316.310451114),
    "('meta-llama/Llama-2-7b-chat-hf', 'gigaword')": (0.65265599579, 1.61221334699, 0.203963866387, 7.77256658209e-08),
    "('meta-llama/Llama-2-7b-chat-hf', 'wikiword')": (1.10820500056e-16, 1.42413350177, 0.0494370037209, 181.81689969),
    "('openchat/openchat_3.5', 'flan')": (7.48101007959e-13, 2.14041757993, 0.0610714316899, 291.439271718),
    "('openchat/openchat_3.5', 'gigaword')": (0.442685808441, 1.59285815147, 0.190752780851, 62.6334333168),
    "('openchat/openchat_3.5', 'wikiword')": (1.608493018e-14, 1.44339217538, 0.0506461652539, 231.637812601),
    "('Qwen/Qwen1.5-1.8B-Chat', 'flan')": (6.27129779661e-22, 2.42847553091, 0.057062250832, 313.402069889),
    "('Qwen/Qwen1.5-1.8B-Chat', 'gigaword')": (0.672573369247, 1.68982128645, 0.195871473078, 1.58312141503e-07),
    "('Qwen/Qwen1.5-1.8B-Chat', 'wikiword')": (1.15387868584e-16, 1.78180769947, 0.0558031745887, 246.147766024),
    "('Qwen/Qwen1.5-7B-Chat', 'flan')": (2.41403848954e-21, 1.7337576363, 0.0522215109255, 287.46828319),
    "('Qwen/Qwen1.5-7B-Chat', 'gigaword')": (0.701829784636, 1.54187766402, 0.206394932847, 3.91240722118e-08),
    "('Qwen/Qwen1.5-7B-Chat', 'wikiword')": (5.58834583656e-19, 1.35838705992, 0.0471114172229, 255.717687844),
    "('tiiuae/falcon-1b', 'flan')": (2.93496793134e-17, 4.38619638046, 0.0803432225412, 635.63611150),
    "('tiiuae/falcon-1b', 'gigaword')": (0.642779525838, 7.68985835857, 0.246590363741, 1906.41074333),
    "('tiiuae/falcon-1b', 'wikiword')": (3.18649499919e-12, 3.10653686584, 0.103057831142, 1573.84972758),
    "('tiiuae/falcon-7b-instruct', 'flan')": (2.60111608485e-16, 2.22355106974, 0.0722324800102, 397.155408139),
    "('tiiuae/falcon-7b-instruct', 'gigaword')": (0.669375089848, 1.61392768813, 0.203131910042, 2.41640431048e-08),
    "('tiiuae/falcon-7b-instruct', 'wikiword')": (1.6813344452e-18, 1.29755740218, 0.0449371852927, 230.006648709),
    "('t5-base', 'flan')": (1.20686838657e-20, 3.57216973557, 0.0528291592115, 454.455171846),
    "('t5-base', 'gigaword')": (0.416740988169, 1.82337939819, 0.167459974094, 5.3249802882e-09),
    "('t5-base', 'wikiword')": (8.88575837592e-06, 2.39174846722, 0.0498315754901, 304.000915976),
    "('t5-small', 'flan')": (1.04548316353e-21, 4.42886795655, 0.0609221599197, 428.387583993),
    "('t5-small', 'gigaword')": (0.558546921255, 2.42478473544, 0.209095628426, 173.820139152),
    "('t5-small', 'wikiword')": (1.89469511102e-15, 3.00546805873, 0.0576975262125, 352.660073561),
}

# Robust fallback parameters (median across groups), used if an unknown group is requested.
# Tuple order is (L_inf, A, alpha, N0), the order in which _predict_loss() unpacks it.
FALLBACK_PARAMS = (4.73642516861e-15, 4.40890174984, 0.0821440711414, 432.538637586)


def _predict_loss(N: float, params: tuple[float, float, float, float]) -> float:
    L0, A, alpha, N0 = params
    x = float(N) + float(N0)
    if x < 1e-9:
        x = 1e-9
    return float(L0) + float(A) * (x ** (-float(alpha)))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the SFT loss for every row using the per-group shifted power law.

    Args:
        input_data: A list of dictionaries, one per data point. The value under
                    "sft_data_size" is passed to _predict_loss as the data
                    size; a missing key is read as 0.0.
        group: The name of the experimental group. Its parameter tuple is
                looked up in GROUP_PARAMS; an unknown group uses
                FALLBACK_PARAMS instead. The functional form is the same for
                every group, only the coefficients differ.

    Returns:
        A list of dictionaries, in the same order as input_data, each holding
        the prediction under the key "sft_loss".
    """
    coeffs = GROUP_PARAMS.get(group, FALLBACK_PARAMS)
    predictions: List[Dict[str, float]] = []
    for record in input_data:
        size = record.get("sft_data_size", 0.0)
        predictions.append({"sft_loss": _predict_loss(size, coeffs)})
    return predictions

#5 Run 5 R² = 0.264702

▼

Python

from __future__ import annotations
import math, re
from typing import List, Dict

# Coefficients fitted on the provided dataset using the law:
#   L(N) = L_inf + k * N**(-alpha)
# Functional form is shared across groups; only coefficients differ.
#
# Each key is a group name written as the string form of a two-element tuple,
# e.g. "('gpt2', 'flan')"; the second element is the part law() extracts with
# its regex to pick a per-dataset fallback (presumably model name first and
# dataset name second -- confirm against the data source).
# Each value holds the three coefficients read by _predict_one: 'L_inf', 'k', 'alpha'.
# Most fitted 'L_inf' values are vanishingly small (e.g. 1e-19), i.e. effectively zero.

COEFFS: dict[str, dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 2.32602981455e-19, 'k': 6.97388770412, 'alpha': 0.0813415766836},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 9.01714251449e-18, 'k': 5.9237450148, 'alpha': 0.105208639131},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 2.97921363876e-13, 'k': 3.78159598424, 'alpha': 0.0640731141802},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 6.75683775834e-22, 'k': 5.41619406169, 'alpha': 0.0720358144505},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 2.59755209321e-16, 'k': 5.35068812271, 'alpha': 0.109523740458},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 4.7406071092e-16, 'k': 2.88150676493, 'alpha': 0.0539025528231},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 2.07589754583e-15, 'k': 3.7266874915, 'alpha': 0.0513362291721},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 8.33979003212e-22, 'k': 4.60180205858, 'alpha': 0.0893794208949},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 2.08399294226e-13, 'k': 3.16705223304, 'alpha': 0.0501048235748},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 5.00467018061e-20, 'k': 4.49777573647, 'alpha': 0.0491972970206},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 4.66802653456e-13, 'k': 5.23742845695, 'alpha': 0.100041268932},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 2.44788262041e-24, 'k': 4.45601962703, 'alpha': 0.0690574152446},
    "('facebook/bart-base', 'flan')": {'L_inf': 3.81763239739e-24, 'k': 6.79659479072, 'alpha': 0.0855017030342},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 8.89609277179e-20, 'k': 7.594466362, 'alpha': 0.133309807287},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 0.250521154378, 'k': 5.86196164531, 'alpha': 0.120185944723},
    "('facebook/bart-large', 'flan')": {'L_inf': 3.69858816566e-15, 'k': 5.12131355301, 'alpha': 0.0740782957819},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 3.7736342501e-13, 'k': 7.68393185227, 'alpha': 0.14310680226},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.781463763912, 'k': 2.62075089349, 'alpha': 0.115203588592},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 1.11456520733e-15, 'k': 3.21710442803, 'alpha': 0.0490513209021},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 1.28428679752e-24, 'k': 5.22999701626, 'alpha': 0.108052731066},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 8.39484227831e-17, 'k': 2.34912920881, 'alpha': 0.0417630255941},
    "('facebook/opt-350m', 'flan')": {'L_inf': 2.12896705802e-13, 'k': 4.46817476021, 'alpha': 0.0576120681765},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 1.44205901232e-14, 'k': 6.49920240563, 'alpha': 0.123318370565},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 1.16479244482e-17, 'k': 3.24183866898, 'alpha': 0.0554642626476},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 1.21510504514e-18, 'k': 2.23396791349, 'alpha': 0.0191484545848},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 6.16775164569e-13, 'k': 2.17707962767, 'alpha': 0.0144989474484},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.175919014957, 'k': 2.74005136883, 'alpha': 0.10447327565},
    "('gpt2', 'flan')": {'L_inf': 2.5e-323, 'k': 3.72739393067, 'alpha': 0.0547825189783},
    "('gpt2', 'gigaword')": {'L_inf': 1.36550147093e-24, 'k': 4.77140298652, 'alpha': 0.0885343305552},
    "('gpt2', 'wikiword')": {'L_inf': 1.11505657262e-18, 'k': 2.47064530041, 'alpha': 0.0392326880497},
    "('google/mt5-base', 'flan')": {'L_inf': 2.85373464182e-17, 'k': 6.32490044618, 'alpha': 0.10705970048},
    "('google/mt5-base', 'gigaword')": {'L_inf': 2.09467361407e-22, 'k': 7.23146392424, 'alpha': 0.134074989853},
    "('google/mt5-base', 'wikiword')": {'L_inf': 0.0060702888029, 'k': 3.91259799635, 'alpha': 0.0828354050287},
    "('google/mt5-large', 'flan')": {'L_inf': 8.78500761998e-21, 'k': 5.90447421308, 'alpha': 0.0953409619567},
    "('google/mt5-large', 'gigaword')": {'L_inf': 4.07696061715e-16, 'k': 7.11451773393, 'alpha': 0.136344443788},
    "('google/mt5-large', 'wikiword')": {'L_inf': 0.335229341613, 'k': 2.54590405026, 'alpha': 0.103779566203},
    "('t5-base', 'flan')": {'L_inf': 2.6467349184e-20, 'k': 6.08769684037, 'alpha': 0.0932843181384},
    "('t5-base', 'gigaword')": {'L_inf': 3.68496645579e-20, 'k': 6.8125614773, 'alpha': 0.116536448907},
    "('t5-base', 'wikiword')": {'L_inf': 0.0, 'k': 4.67827633118, 'alpha': 0.114076823388},
    "('t5-small', 'flan')": {'L_inf': 3.35247273446e-21, 'k': 7.06663234781, 'alpha': 0.121259374307},
    "('t5-small', 'gigaword')": {'L_inf': 1.19919213784e-19, 'k': 7.38374240969, 'alpha': 0.150851024609},
    "('t5-small', 'wikiword')": {'L_inf': 3.28440821836e-21, 'k': 5.01746548505, 'alpha': 0.109593599131},
}

# Fallback coefficients keyed by dataset name ('flan', 'gigaword', 'wikiword').
# law() uses these when the requested group is not in COEFFS but its name ends
# with a recognised dataset. Same 'L_inf' / 'k' / 'alpha' layout as COEFFS.
# NOTE(review): the name suggests per-dataset medians of the fitted
# coefficients -- confirm against the fitting script.
MEDIAN_BY_DATASET: dict[str, dict[str, float]] = {
    'flan': {'L_inf': 1.26146861678e-15, 'k': 4.48297517797, 'alpha': 0.0552117421974},
    'gigaword': {'L_inf': 1.34386175918e-16, 'k': 5.23371253922, 'alpha': 0.108788384497},
    'wikiword': {'L_inf': 2.53160329051e-13, 'k': 3.20444552644, 'alpha': 0.0597685559862},
}

# Last-resort coefficients: law() uses them when neither the group nor its
# dataset suffix is recognised (presumably medians over all groups -- confirm).
GLOBAL_MEDIAN = {'L_inf': 1.29103697185e-15, 'k': 3.99938013481, 'alpha': 0.0685869934535}

def _predict_one(N: float, coeffs: dict[str, float]) -> float:
    if N <= 0:
        N = 1.0
    L_inf = float(coeffs['L_inf'])
    k = float(coeffs['k'])
    alpha = float(coeffs['alpha'])
    return max(0.0, L_inf + k * (N ** (-alpha)))


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``sft_loss`` for each data point using the group's fitted coefficients.

    Args:
        input_data: A list of dictionaries, one per data point. The input size
                    is read from ``'sft_data_size'``, then from ``'N'``, and
                    defaults to ``0.0`` when neither key is present.
        group: The name of the experimental group. The functional form is the
                same for all groups; only the coefficients differ. A group
                missing from ``COEFFS`` falls back to the coefficients for the
                dataset named at the end of the group string, and otherwise to
                ``GLOBAL_MEDIAN``.

    Returns:
        A list, in the same order as ``input_data``, of dictionaries of the
        form ``{'sft_loss': <prediction>}``.
    """
    params = COEFFS.get(group)
    if params is None:
        # Unknown group: try to recover the dataset name from a trailing
        # ", '<dataset>')" and use that dataset's fallback coefficients.
        dataset_match = re.search(r", '([^']+)'\)$", str(group))
        params = (
            MEDIAN_BY_DATASET.get(dataset_match.group(1), GLOBAL_MEDIAN)
            if dataset_match
            else GLOBAL_MEDIAN
        )
    return [
        {
            'sft_loss': float(
                _predict_one(
                    float(row.get('sft_data_size', row.get('N', 0.0))),
                    params,
                )
            )
        }
        for row in input_data
    ]