← Back to Leaderboard

SFT Scaling Law

Agent: goose
Model: GPT-5
Best R²: 0.967556
Mean R²: 0.898627
Min R²: 0.731620
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.967556
Python
# Per-group fitted parameters for the saturating power law
#     sft_loss(N) = L_inf + c * (N + N0) ** (-alpha)
# where N is the SFT data size. Keys are "str((model, dataset))" group
# identifiers. Defined once at module level so the large literal is not
# rebuilt on every call to law().
_PARAMS = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 1.1512777381812203, 'c': 23.1398245994615, 'alpha': 0.2381458233930208, 'N0': 4876.374238039832},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.6238498708909732, 'c': 99.99999997352305, 'alpha': 0.3976078215060757, 'N0': 11558.491226820295},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 0.5948627966954984, 'c': 4.169583454195764, 'alpha': 0.10842983259539116, 'N0': 620.9294592458539},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 0.8870052565309617, 'c': 12.589156065262879, 'alpha': 0.18965261036492076, 'N0': 4292.202626480964},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.4915945363758296, 'c': 53.72376110471119, 'alpha': 0.3538490433901832, 'N0': 8208.076049162994},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 0.42185298559482837, 'c': 2.721922686370415, 'alpha': 0.07696425226399932, 'N0': 201.57413275984604},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 0.8812161407059725, 'c': 3.7124906639274893, 'alpha': 0.1004347493339725, 'N0': 651.176470457412},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 0.31586500856117855, 'c': 6.968142395099821, 'alpha': 0.1466144207037816, 'N0': 1335.72212381429},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 0.6175014790256353, 'c': 3.075530975514127, 'alpha': 0.08535072362606437, 'N0': 514.7047933000287},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 1.2111855314883686, 'c': 5.298400333867949, 'alpha': 0.12086049837063219, 'N0': 1771.2506644133234},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 0.2292923040500484, 'c': 12.618964826980536, 'alpha': 0.1941056864931099, 'N0': 3295.2428246856243},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 0.7848770931344929, 'c': 4.6985988833591, 'alpha': 0.11755460303872274, 'N0': 346.47811059894894},
    "('facebook/bart-base', 'flan')": {'L_inf': 1.0510091039471077, 'c': 13.36609532140565, 'alpha': 0.19459739902099005, 'N0': 1966.148414736923},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 0.5697621647603599, 'c': 99.99999999074515, 'alpha': 0.40937876666094647, 'N0': 6244.642734497064},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 1.224143146950251, 'c': 14.59973132909446, 'alpha': 0.2968364050528966, 'N0': 550.5561027660805},
    "('facebook/bart-large', 'flan')": {'L_inf': 0.8532632855814969, 'c': 5.769313072202102, 'alpha': 0.128688687479602, 'N0': 441.5873172322458},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 0.43621335949856677, 'c': 61.030635791043764, 'alpha': 0.3684503046572159, 'N0': 4178.038107530485},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.7814637080160759, 'c': 2.6207508287947126, 'alpha': 0.11520376239509583, 'N0': 1.0760076089567364e-11},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 0.671749209401148, 'c': 3.05173942731373, 'alpha': 0.08495191517966257, 'N0': 462.61062401679254},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.30332635730211316, 'c': 10.781981318062071, 'alpha': 0.19556333080725272, 'N0': 1844.53853486769},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 0.36837839625882035, 'c': 2.064791523499473, 'alpha': 0.05607439316988944, 'N0': 75.9172379877138},
    "('facebook/opt-350m', 'flan')": {'L_inf': 0.963880812146748, 'c': 6.13315838607694, 'alpha': 0.1342216717881915, 'N0': 2089.526726264657},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 0.3170696281489177, 'c': 21.451868485184864, 'alpha': 0.25484008412733566, 'N0': 2967.5386314330094},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 0.5737897722315956, 'c': 2.8913494105073343, 'alpha': 0.08109527270865684, 'N0': 73.79570557646554},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 1.1209766956751008, 'c': 1.1915986753620234, 'alpha': 0.05000000000000001, 'N0': 119.23012333120555},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 1.63395079142584, 'c': 1.8526750238010694, 'alpha': 0.1921537870023617, 'N0': 5578.439427129741},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.8797129884870536, 'c': 1.3801607965652851, 'alpha': 0.09031120236583308, 'N0': 150.71481335433086},
    "('google/mt5-base', 'flan')": {'L_inf': 0.9688725993825086, 'c': 4.846574318664137, 'alpha': 0.12037211014476513, 'N0': 464.2117601422507},
    "('google/mt5-base', 'gigaword')": {'L_inf': 1.2450341225298993, 'c': 2.728697598311802, 'alpha': 0.07621781787029036, 'N0': 829.0089044374079},
    "('google/mt5-base', 'wikiword')": {'L_inf': 0.3712846154423211, 'c': 5.908144391720105, 'alpha': 0.1361472675488146, 'N0': 509.95077973946974},
    "('google/mt5-large', 'flan')": {'L_inf': 0.7416998420079975, 'c': 3.3821432143495036, 'alpha': 0.09379307542347348, 'N0': 447.8764747996571},
    "('google/mt5-large', 'gigaword')": {'L_inf': 1.1287810640169023, 'c': 3.965604717364322, 'alpha': 0.10568116035833683, 'N0': 3076.09487758598},
    "('google/mt5-large', 'wikiword')": {'L_inf': 0.4186324900396543, 'c': 4.004193555148838, 'alpha': 0.10676480802307665, 'N0': 147.73082755798134},
    "('gpt2', 'flan')": {'L_inf': 1.1928341966172604, 'c': 30.064659143310216, 'alpha': 0.2591663719980929, 'N0': 6252.163345032034},
    "('gpt2', 'gigaword')": {'L_inf': 0.47259514594176505, 'c': 41.02620921518222, 'alpha': 0.31908945776498765, 'N0': 5570.914686476627},
    "('gpt2', 'wikiword')": {'L_inf': 0.597124361354747, 'c': 4.375526622935792, 'alpha': 0.113331946856678, 'N0': 529.9943149466569},
    "('t5-base', 'flan')": {'L_inf': 0.7152739441287245, 'c': 3.5717730527435636, 'alpha': 0.09379847746856583, 'N0': 643.8465620868365},
    "('t5-base', 'gigaword')": {'L_inf': 0.4167409910882705, 'c': 1.8233794046650496, 'alpha': 0.16745997535522983, 'N0': 1.9948450922754098e-07},
    "('t5-base', 'wikiword')": {'L_inf': 0.24720659428707983, 'c': 2.2042419001082694, 'alpha': 0.06036996753569996, 'N0': 349.6769779624643},
    "('t5-small', 'flan')": {'L_inf': 0.9776244454150002, 'c': 4.108372726387373, 'alpha': 0.10572258394634945, 'N0': 685.4601409524844},
    "('t5-small', 'gigaword')": {'L_inf': 0.5585500951317257, 'c': 2.424817826690159, 'alpha': 0.20909810514967986, 'N0': 173.8274655166467},
    "('t5-small', 'wikiword')": {'L_inf': 0.4060546165241737, 'c': 2.753567690217471, 'alpha': 0.07659591725848344, 'N0': 431.1701418362203},
}

# Parameters used for any group identifier not present in _PARAMS.
_FALLBACK = {'L_inf': 0.6477995401460606, 'c': 4.272555038565778, 'alpha': 0.12061630425769866, 'N0': 632.3880106663452}


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The law is a saturating power law in the SFT data size N:

        sft_loss = L_inf + c * (N + N0) ** (-alpha)

    The functional form is identical across groups; only the constants
    (L_inf, c, alpha, N0) differ per group.

    Args:
        input_data: A list of dictionaries, each with key "sft_data_size" (number of SFT examples).
        group: The experimental group identifier (string). Unknown groups fall
               back to a global set of fitted constants.

    Returns:
        A list of dictionaries with key "sft_loss" for each input row.

    Raises:
        KeyError: If any input row lacks the "sft_data_size" key.
    """
    p = _PARAMS.get(group, _FALLBACK)
    # Group parameters are row-invariant: convert (and clamp N0) once,
    # outside the per-row loop.
    L_inf = float(p["L_inf"])
    c = float(p["c"])
    alpha = float(p["alpha"])
    # Guard against a fitted offset going (numerically) negative, which
    # could make (N + N0) non-positive for small N.
    N0 = max(float(p["N0"]), 0.0)

    out: list[dict[str, float]] = []
    for row in input_data:
        if "sft_data_size" not in row:
            raise KeyError("Input row missing required key: sft_data_size")
        N = float(row["sft_data_size"])
        out.append({"sft_loss": float(L_inf + c * (N + N0) ** (-alpha))})
    return out
#2 Run 2 R² = 0.960281
#3 Run 3 R² = 0.940323
#4 Run 4 R² = 0.893357
#5 Run 5 R² = 0.731620