from __future__ import annotations
from typing import Dict, List
# Common functional form across all groups (a saturating power law):
#   sft_loss(N) = L_inf + A * (N + N0) ** (-alpha)
# where N is `sft_data_size` and the fitted constants (L_inf, A, alpha, N0)
# vary by `group`.
# Keys are stringified (model_name, dataset_name) tuples, e.g.
# "('gpt2', 'flan')" — look up with the exact string, not a tuple.
# NOTE(review): A values pinned at ~100 (e.g. LaMini-GPT-124M/gigaword) look
# like they hit an optimizer bound during fitting — presumably an upper
# constraint of 100 on A; verify against the fitting script.
PARAMS: Dict[str, Dict[str, float]] = {
"('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 1.865671228941949e-17, 'A': 12.637678561446139, 'alpha': 0.13564240302792172, 'N0': 3172.8349547867774},
"('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.6238498800785734, 'A': 99.99999999999999, 'alpha': 0.3976078224298724, 'N0': 11558.491067541954},
"('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 5.063925056378967e-15, 'A': 4.23348975814935, 'alpha': 0.07460412111294643, 'N0': 436.6866783168706},
"('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 9.10578307107794e-19, 'A': 8.92224005340684, 'alpha': 0.11739594489898576, 'N0': 3069.407127001592},
"('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.49159473249601304, 'A': 53.72380342046997, 'alpha': 0.35384913142132957, 'N0': 8208.078352425737},
"('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 1.2593622265895715e-07, 'A': 2.9896484714861424, 'alpha': 0.05735309742066091, 'N0': 140.71022280680234},
"('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 1.1318279234338576e-18, 'A': 4.062878459420533, 'alpha': 0.05934500761506854, 'N0': 426.03416018746805},
"('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 2.3927299190951424e-21, 'A': 6.336196488869017, 'alpha': 0.1192014399950389, 'N0': 1084.1232942926576},
"('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 9.220157697788088e-20, 'A': 3.4101310251993695, 'alpha': 0.056959891171103054, 'N0': 363.70595860114133},
"('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 2.315142285734426e-22, 'A': 5.319365673488941, 'alpha': 0.06450038022371594, 'N0': 1162.8602806462848},
"('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 1.01404483163551e-14, 'A': 10.7925493382822, 'alpha': 0.16678612735396567, 'N0': 2909.744466994437},
"('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 1.6675446178776843e-19, 'A': 4.756329316561344, 'alpha': 0.07520633987592405, 'N0': 197.06918542242636},
"('facebook/bart-base', 'flan')": {'L_inf': 3.4024201572921337e-19, 'A': 9.466998707085137, 'alpha': 0.1163303938056367, 'N0': 1218.0973728427302},
"('facebook/bart-base', 'gigaword')": {'L_inf': 0.569762177448528, 'A': 99.9999998931003, 'alpha': 0.4093787678794461, 'N0': 6244.642597957972},
"('facebook/bart-base', 'wikiword')": {'L_inf': 1.2241220338350347, 'A': 14.5986109269959, 'alpha': 0.29682572736757595, 'N0': 550.5089063545902},
"('facebook/bart-large', 'flan')": {'L_inf': 1.1510770614726884e-17, 'A': 5.611410375270038, 'alpha': 0.0826931330009914, 'N0': 269.42264002978004},
"('facebook/bart-large', 'gigaword')": {'L_inf': 0.4362137935788755, 'A': 61.030730385354516, 'alpha': 0.36845048401898794, 'N0': 4178.040636016911},
"('facebook/bart-large', 'wikiword')": {'L_inf': 0.7814640933847434, 'A': 2.6207508142290563, 'alpha': 0.11520380951312024, 'N0': 7.81642673397928e-11},
"('facebook/opt-1.3b', 'flan')": {'L_inf': 3.0801677443604495e-12, 'A': 3.437167448516286, 'alpha': 0.05519316053319131, 'N0': 323.52209754302413},
"('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.303325908352575, 'A': 10.781976113943252, 'alpha': 0.19556325386045303, 'N0': 1844.5375817073113},
"('facebook/opt-1.3b', 'wikiword')": {'L_inf': 6.849950839473154e-13, 'A': 2.371033103373906, 'alpha': 0.0426291470638209, 'N0': 42.52989450771065},
"('facebook/opt-350m', 'flan')": {'L_inf': 1.0439604500936763e-21, 'A': 5.627459766341364, 'alpha': 0.07868013844546193, 'N0': 1427.3103646514269},
"('facebook/opt-350m', 'gigaword')": {'L_inf': 0.3170636936481373, 'A': 21.45165635532035, 'alpha': 0.2548387620240815, 'N0': 2967.5199471791925},
"('facebook/opt-350m', 'wikiword')": {'L_inf': 7.435230193328557e-18, 'A': 3.257815231163734, 'alpha': 0.055927008001466834, 'N0': 15.872263908578827},
"('facebook/opt-6.7b', 'flan')": {'L_inf': 1.7464718748726132e-09, 'A': 2.239883130424448, 'alpha': 0.019392187633072766, 'N0': 27.45074276948627},
"('facebook/opt-6.7b', 'gigaword')": {'L_inf': 1.6339485409447412, 'A': 1.852648048421951, 'alpha': 0.19215155224167946, 'N0': 5578.387896916012},
"('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.8797137795917916, 'A': 1.3801606877059753, 'alpha': 0.09031133705708351, 'N0': 150.715299869533},
"('google/mt5-base', 'flan')": {'L_inf': 1.0369199206942231e-16, 'A': 4.936124715180229, 'alpha': 0.07082556462170639, 'N0': 268.26577139087215},
"('google/mt5-base', 'gigaword')": {'L_inf': 2.993452899495181e-20, 'A': 3.657207489347324, 'alpha': 0.037261242285363386, 'N0': 549.53675753708},
"('google/mt5-base', 'wikiword')": {'L_inf': 8.178379339595827e-17, 'A': 5.558671475357777, 'alpha': 0.10787587390263807, 'N0': 388.26365277913226},
"('google/mt5-large', 'flan')": {'L_inf': 2.0351395227477922e-12, 'A': 3.7361033461210726, 'alpha': 0.059085413639518115, 'N0': 296.79195384894666},
"('google/mt5-large', 'gigaword')": {'L_inf': 1.7285593853941235e-20, 'A': 4.301700747580993, 'alpha': 0.054175354177230746, 'N0': 2255.1405984927396},
"('google/mt5-large', 'wikiword')": {'L_inf': 3.385679168082505e-20, 'A': 4.070364998240034, 'alpha': 0.08159499527141832, 'N0': 84.73574253453266},
"('gpt2', 'flan')": {'L_inf': 4.015340729288829e-15, 'A': 14.34124705639774, 'alpha': 0.14433103359274688, 'N0': 3987.9597869657864},
"('gpt2', 'gigaword')": {'L_inf': 0.47259432236704396, 'A': 41.02611997599669, 'alpha': 0.3190891985857585, 'N0': 5570.9100030079235},
"('gpt2', 'wikiword')": {'L_inf': 1.8048515222229452e-22, 'A': 4.388935478788763, 'alpha': 0.0780881235127983, 'N0': 365.9997120466946},
"('t5-base', 'flan')": {'L_inf': 8.028632468328126e-18, 'A': 3.8842431380507225, 'alpha': 0.06076610065498959, 'N0': 454.6947810694002},
"('t5-base', 'gigaword')": {'L_inf': 0.416740982867647, 'A': 1.8233793878342568, 'alpha': 0.16745997189052603, 'N0': 7.833754721840204e-09},
"('t5-base', 'wikiword')": {'L_inf': 1.7554230404581307e-14, 'A': 2.3917550734396555, 'alpha': 0.049831240630992694, 'N0': 303.9980112153414},
"('t5-small', 'flan')": {'L_inf': 2.006136285290001e-21, 'A': 4.428866599546551, 'alpha': 0.06092213223812954, 'N0': 428.3855452155827},
"('t5-small', 'gigaword')": {'L_inf': 0.5585503700141525, 'A': 2.424820616633481, 'alpha': 0.20909831661111153, 'N0': 173.82806592094695},
"('t5-small', 'wikiword')": {'L_inf': 1.0419309334128787e-12, 'A': 3.0054681324705808, 'alpha': 0.057697528448694614, 'N0': 352.6602281915071},
}
# Fallback parameters used by `law` when an unknown group string is requested;
# roughly median-typical behavior across the fitted groups.
FALLBACK = {'L_inf': 0.5, 'A': 5.0, 'alpha': 0.10, 'N0': 300.0}
def _predict_loss(n: float, p: Dict[str, float]) -> float:
# Guard against negative or pathological inputs
n = max(0.0, float(n))
L_inf = float(p['L_inf'])
A = float(p['A'])
alpha = float(p['alpha'])
N0 = float(p['N0'])
return L_inf + A * (n + N0) ** (-alpha)
def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.
    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law must be the same for all groups,
               but the constant parameters/coefficients can differ per group.
    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    Raises:
        KeyError: If any input row is missing 'sft_data_size'.
    """
    # Unknown groups fall back to generic "typical" coefficients.
    coeffs = PARAMS.get(group, FALLBACK)
    # Validate every row up front so no partial results are built on bad input.
    sizes = []
    for record in input_data:
        size = record.get('sft_data_size')
        if size is None:
            raise KeyError("Each input row must include 'sft_data_size'.")
        sizes.append(size)
    return [{'sft_loss': float(_predict_loss(size, coeffs))} for size in sizes]