SLD - SFT Scaling Law - mini-swe-agent + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.971228

▼

Python

from __future__ import annotations
import math
from typing import List, Dict

# Learned parameters for each group for the scaling law:
# sft_loss = c + a * (sft_data_size + x0) ** (-b)
_PARAMS = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'c': 1.564269587, 'a': 87.98498619, 'b': 0.3824763366, 'x0': 10000},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'c': 0.4837739236, 'a': 59.71925558, 'b': 0.3416696269, 'x0': 10000},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'c': 1.048831887, 'a': 5.997674903, 'b': 0.1826307524, 'x0': 1584.893192},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'c': 1.307240186, 'a': 41.76904713, 'b': 0.3242540963, 'x0': 10000},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'c': 0.6260466467, 'a': 104.8492498, 'b': 0.4259772459, 'x0': 10000},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'c': 0.846386623, 'a': 2.652869033, 'b': 0.116385829, 'x0': 251.1886431},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'c': 1.371608086, 'a': 5.177573316, 'b': 0.1776235759, 'x0': 1584.893192},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'c': 0.1373163055, 'a': 7.12887961, 'b': 0.137700517, 'x0': 1584.893192},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'c': 1.084408809, 'a': 3.972975118, 'b': 0.1537142579, 'x0': 1584.893192},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'c': 1.265501879, 'a': 5.42406848, 'b': 0.127521399, 'x0': 1584.893192},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'c': 0, 'a': 8.979116797, 'b': 0.1512820373, 'x0': 1584.893192},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'c': 0.7785420197, 'a': 4.614099641, 'b': 0.1157775285, 'x0': 251.1886431},
    "('facebook/bart-base', 'flan')": {'c': 0.3645941746, 'a': 10.54850589, 'b': 0.1400599591, 'x0': 1584.893192},
    "('facebook/bart-base', 'gigaword')": {'c': 0.8022925003, 'a': 558.1446562, 'b': 0.5859135735, 'x0': 10000},
    "('facebook/bart-base', 'wikiword')": {'c': 0.9874219716, 'a': 8.509733939, 'b': 0.2138652817, 'x0': 251.1886431},
    "('facebook/bart-large', 'flan')": {'c': 0.6377771789, 'a': 5.525963949, 'b': 0.1123388906, 'x0': 251.1886431},
    "('facebook/bart-large', 'gigaword')": {'c': 0.8135473126, 'a': 1721.147572, 'b': 0.7131843465, 'x0': 10000},
    "('facebook/bart-large', 'wikiword')": {'c': 0.7826410482, 'a': 2.626828832, 'b': 0.1156186513, 'x0': 0},
    "('facebook/opt-1.3b', 'flan')": {'c': 1.268665971, 'a': 4.437107393, 'b': 0.182582469, 'x0': 1584.893192},
    "('facebook/opt-1.3b', 'gigaword')": {'c': 0.1162159713, 'a': 9.303401005, 'b': 0.1692837743, 'x0': 1584.893192},
    "('facebook/opt-1.3b', 'wikiword')": {'c': 0.9604445677, 'a': 1.854993681, 'b': 0.1160700451, 'x0': 251.1886431},
    "('facebook/opt-350m', 'flan')": {'c': 0.9725591969, 'a': 5.931693964, 'b': 0.1327214077, 'x0': 1584.893192},
    "('facebook/opt-350m', 'gigaword')": {'c': 0, 'a': 12.90265068, 'b': 0.189106158, 'x0': 1584.893192},
    "('facebook/opt-350m', 'wikiword')": {'c': 0.8957143243, 'a': 2.994444891, 'b': 0.1136282614, 'x0': 251.1886431},
    "('facebook/opt-6.7b', 'flan')": {'c': 1.517869515, 'a': 0.9931489165, 'b': 0.1126229971, 'x0': 251.1886431},
    "('facebook/opt-6.7b', 'gigaword')": {'c': 1.723087933, 'a': 7.567498663, 'b': 0.3652711781, 'x0': 10000},
    "('facebook/opt-6.7b', 'wikiword')": {'c': 0.9976291701, 'a': 1.409723211, 'b': 0.1163785758, 'x0': 251.1886431},
    "('google/mt5-base', 'flan')": {'c': 0.9909552392, 'a': 4.597987709, 'b': 0.1175021959, 'x0': 251.1886431},
    "('google/mt5-base', 'gigaword')": {'c': 1.738144527, 'a': 3.077069792, 'b': 0.1368078084, 'x0': 1584.893192},
    "('google/mt5-base', 'wikiword')": {'c': 0.2199407554, 'a': 5.398032721, 'b': 0.1182095443, 'x0': 251.1886431},
    "('google/mt5-large', 'flan')": {'c': 1.434152687, 'a': 6.288948252, 'b': 0.2299434479, 'x0': 1584.893192},
    "('google/mt5-large', 'gigaword')": {'c': 1.900891986, 'a': 23.65735527, 'b': 0.3546820123, 'x0': 10000},
    "('google/mt5-large', 'wikiword')": {'c': 0.5457049924, 'a': 4.193880429, 'b': 0.1210634752, 'x0': 251.1886431},
    "('gpt2', 'flan')": {'c': 1.490922453, 'a': 75.32900687, 'b': 0.3578463194, 'x0': 10000},
    "('gpt2', 'gigaword')": {'c': 0.8476347336, 'a': 301.256379, 'b': 0.5329780156, 'x0': 10000},
    "('gpt2', 'wikiword')": {'c': 1.203279469, 'a': 7.999141793, 'b': 0.2296289159, 'x0': 1584.893192},
    "('t5-base', 'flan')": {'c': 0.9738756683, 'a': 4.331690322, 'b': 0.132221776, 'x0': 1584.893192},
    "('t5-base', 'gigaword')": {'c': 0.4284101849, 'a': 1.864584477, 'b': 0.1734356468, 'x0': 6.30957344},
    "('t5-base', 'wikiword')": {'c': 1.001935901, 'a': 3.053731805, 'b': 0.1909571108, 'x0': 1584.893192},
    "('t5-small', 'flan')": {'c': 1.142977987, 'a': 4.981709663, 'b': 0.135803553, 'x0': 1584.893192},
    "('t5-small', 'gigaword')": {'c': 0.5884135339, 'a': 2.844380432, 'b': 0.2363651475, 'x0': 251.1886431},
    "('t5-small', 'wikiword')": {'c': 1.083657457, 'a': 4.090656387, 'b': 0.1910050169, 'x0': 1584.893192},
}

# Fallback parameters (mean across groups) used if an unseen group is provided
_FALLBACK = {"c":0.9272686752852628,"a":74.88410175959388,"b":0.22081211328758815,"x0":3132.110732274286}

def _predict_one(n: float, p: dict[str, float]) -> float:
    n = float(n)
    c = float(p.get("c", 0.0))
    a = float(p.get("a", 1.0))
    b = float(p.get("b", 0.5))
    x0 = float(p.get("x0", 0.0))
    # Guard for non-positive n: treat as 0
    if not math.isfinite(n) or n < 0:
        n = 0.0
    return c + a * (n + x0) ** (-b)

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _PARAMS.get(group, _FALLBACK)
    out: list[dict[str, float]] = []
    for item in input_data:
        # Expect 'sft_data_size' as the driver variable
        n = item.get("sft_data_size")
        if n is None:
            # Try common aliases just in case
            n = item.get("N", item.get("n", 0.0))
        yhat = _predict_one(n, params)
        out.append({"sft_loss": float(yhat)})
    return out

#2 Run 2 R² = 0.888404

▼

Python

# Auto-generated scaling law for SFT loss vs data size
# Formula: sft_loss(N) = L_inf + A * N^(-alpha)
# Parameters are fitted per group on the provided dataset.

from typing import List, Dict

PARAMS: dict[str, dict[str, float]] = {
  "('MBZUAI/LaMini-GPT-124M', 'flan')": {
    "Linf": 0.2644679126412921,
    "A": 7.392902084513029,
    "alpha": 0.09772835598361189
  },
  "('MBZUAI/LaMini-GPT-774M', 'flan')": {
    "Linf": 0.43759395405594725,
    "A": 5.531413145210734,
    "alpha": 0.09388832717900626
  },
  "('cerebras/Cerebras-GPT-256M', 'flan')": {
    "Linf": 1.1472914221264334,
    "A": 3.795974200154351,
    "alpha": 0.08774296186718188
  },
  "('cerebras/Cerebras-GPT-1.3B', 'flan')": {
    "Linf": 1.0622036781516453,
    "A": 3.0715307151683127,
    "alpha": 0.09823047544844718
  },
  "('facebook/bart-base', 'flan')": {
    "Linf": 0.15379230840362768,
    "A": 7.116640444912342,
    "alpha": 0.09669213314391904
  },
  "('facebook/bart-large', 'flan')": {
    "Linf": 0.48584141551531856,
    "A": 5.014098914743766,
    "alpha": 0.09512578812569872
  },
  "('facebook/opt-1.3b', 'flan')": {
    "Linf": 0.9741057484295051,
    "A": 2.5825359769036993,
    "alpha": 0.09681201597062883
  },
  "('facebook/opt-350m', 'flan')": {
    "Linf": 0.848643995048443,
    "A": 4.077856456099283,
    "alpha": 0.09121613844113714
  },
  "('facebook/opt-6.7b', 'flan')": {
    "Linf": 1.4899158103248278,
    "A": 0.9025795550681525,
    "alpha": 0.09537760187157074
  },
  "('gpt2', 'flan')": {
    "Linf": 0.21870910496088403,
    "A": 7.684950255452461,
    "alpha": 0.09703407026496974
  },
  "('t5-base', 'flan')": {
    "Linf": 0.8812008838577184,
    "A": 3.0096014573710135,
    "alpha": 0.09170378077307463
  },
  "('t5-small', 'flan')": {
    "Linf": 1.0386471794947751,
    "A": 3.426027780307913,
    "alpha": 0.09425842276668721
  },
  "('google/mt5-base', 'flan')": {
    "Linf": 0.8687155691785333,
    "A": 4.14230239210097,
    "alpha": 0.09945773996866068
  },
  "('google/mt5-large', 'flan')": {
    "Linf": 0.9728333303560993,
    "A": 2.8951405820618747,
    "alpha": 0.0994449930593957
  },
  "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {
    "Linf": -0.4567897703724608,
    "A": 6.853428489126484,
    "alpha": 0.10181381322441063
  },
  "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {
    "Linf": -0.42935635602180255,
    "A": 6.151009176850955,
    "alpha": 0.1038374012172193
  },
  "('cerebras/Cerebras-GPT-256M', 'gigaword')": {
    "Linf": -0.26614613775260576,
    "A": 5.792272468065782,
    "alpha": 0.09844696669260672
  },
  "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {
    "Linf": -0.010002282397518769,
    "A": 4.859797353275681,
    "alpha": 0.09525580361484658
  },
  "('facebook/bart-base', 'gigaword')": {
    "Linf": -0.9984903017327353,
    "A": 8.510739423060306,
    "alpha": 0.10554630719642306
  },
  "('facebook/bart-large', 'gigaword')": {
    "Linf": -1.0736871011852702,
    "A": 8.425514404986941,
    "alpha": 0.10737344387236608
  },
  "('facebook/opt-1.3b', 'gigaword')": {
    "Linf": -0.27696534696465847,
    "A": 5.673848264697922,
    "alpha": 0.10268168921870761
  },
  "('facebook/opt-350m', 'gigaword')": {
    "Linf": -0.6807447105738882,
    "A": 7.190031773836993,
    "alpha": 0.10375543896859384
  },
  "('facebook/opt-6.7b', 'gigaword')": {
    "Linf": 1.6072111101801454,
    "A": 0.7236059283739782,
    "alpha": 0.09922385277336458
  },
  "('gpt2', 'gigaword')": {
    "Linf": -0.6244255837265738,
    "A": 7.189854871198778,
    "alpha": 0.10372952717809261
  },
  "('t5-base', 'gigaword')": {
    "Linf": 0.13235370541888736,
    "A": 1.6829180686329805,
    "alpha": 0.09721255441041499
  },
  "('t5-small', 'gigaword')": {
    "Linf": 0.23584916358105795,
    "A": 1.7482817915517455,
    "alpha": 0.10149881529338796
  },
  "('google/mt5-base', 'gigaword')": {
    "Linf": 1.6747833275590822,
    "A": 2.102174310233926,
    "alpha": 0.09464789431169726
  },
  "('google/mt5-large', 'gigaword')": {
    "Linf": 1.487844720370509,
    "A": 2.475132952553586,
    "alpha": 0.09621416453798393
  },
  "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {
    "Linf": 0.649809905727861,
    "A": 3.4886667687707806,
    "alpha": 0.09671367952481208
  },
  "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {
    "Linf": 0.7744533276207425,
    "A": 2.397110707298485,
    "alpha": 0.0985784748438589
  },
  "('cerebras/Cerebras-GPT-256M', 'wikiword')": {
    "Linf": 0.6527501355366048,
    "A": 4.172713150671879,
    "alpha": 0.09805474351303752
  },
  "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {
    "Linf": 0.9119378489893127,
    "A": 2.587920259576489,
    "alpha": 0.09589627115811068
  },
  "('facebook/bart-base', 'wikiword')": {
    "Linf": -0.02833468290713781,
    "A": 5.828239751140302,
    "alpha": 0.10449269492583729
  },
  "('facebook/bart-large', 'wikiword')": {
    "Linf": 0.5956038597493062,
    "A": 2.6528660591119273,
    "alpha": 0.0957153366743184
  },
  "('facebook/opt-1.3b', 'wikiword')": {
    "Linf": 0.9094528391371892,
    "A": 1.6787189787975223,
    "alpha": 0.0983368205391434
  },
  "('facebook/opt-350m', 'wikiword')": {
    "Linf": 0.8109548933586989,
    "A": 2.7232908330556858,
    "alpha": 0.09634969340031328
  },
  "('facebook/opt-6.7b', 'wikiword')": {
    "Linf": 0.9589016638350853,
    "A": 1.2766691408885675,
    "alpha": 0.09871035838369577
  },
  "('gpt2', 'wikiword')": {
    "Linf": 0.6125135065620836,
    "A": 3.6943704466210443,
    "alpha": 0.09930774608957725
  },
  "('t5-base', 'wikiword')": {
    "Linf": 0.787324793500978,
    "A": 1.711734510517094,
    "alpha": 0.09693028358929047
  },
  "('t5-small', 'wikiword')": {
    "Linf": 0.801560693099266,
    "A": 2.2866597170318412,
    "alpha": 0.09729548662578053
  },
  "('google/mt5-base', 'wikiword')": {
    "Linf": 0.07602277288787551,
    "A": 4.861077329303588,
    "alpha": 0.10005253831788846
  },
  "('google/mt5-large', 'wikiword')": {
    "Linf": 0.38490261778168056,
    "A": 3.7826744174681144,
    "alpha": 0.09921917188340773
  },
  "default": {
    "Linf": -2.758689901713578,
    "A": 6.6021114932519085,
    "alpha": 0.05
  }
}

def _predict_one(n: float, p: dict[str, float]) -> float:
    # Safeguards
    if n is None:
        return float("nan")
    try:
        x = float(n)
    except Exception:
        return float("nan")
    if not (x > 0.0) or not (x == x):
        # x <= 0 or NaN
        x = 1.0
    L_inf = p.get("Linf", 0.0)
    A = p.get("A", 1.0)
    alpha = p.get("alpha", 0.5)
    # Compute prediction
    return L_inf + A * (x ** (-alpha))

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    p = PARAMS.get(group, PARAMS.get("default", {"Linf": 0.0, "A": 1.0, "alpha": 0.5}))
    outputs: List[Dict[str, float]] = []
    for row in input_data:
        n = None
        if isinstance(row, dict):
            # standard key
            if "sft_data_size" in row:
                n = row["sft_data_size"]
            else:
                # fallback: try common variants
                for k in ["N", "n", "data_size", "num_examples"]:
                    if k in row:
                        n = row[k]
                        break
        yhat = _predict_one(n, p)
        outputs.append({"sft_loss": float(yhat)})
    return outputs

#3 Run 3 R² = 0.883449

▼

Python

from __future__ import annotations
import math

# Per-group parameters for the scaling law:
# sft_loss(N) = L + A * N**(-alpha)
PARAMS = {
  "('MBZUAI/LaMini-GPT-124M', 'flan')": {
    "A": 7.440100211083904,
    "L": 0.07578693008730511,
    "alpha": 0.09162057288333554
  },
  "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {
    "A": 6.969627229339258,
    "L": 1.0000000085225302e-06,
    "alpha": 0.12488803834188497
  },
  "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {
    "A": 3.519022121819693,
    "L": 0.5553046166310998,
    "alpha": 0.0905350448853153
  },
  "('MBZUAI/LaMini-GPT-774M', 'flan')": {
    "A": 5.577220212652247,
    "L": 0.2926528238084617,
    "alpha": 0.08805621954625023
  },
  "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {
    "A": 6.293122362913966,
    "L": 1.0000000167985103e-06,
    "alpha": 0.12918488624821686
  },
  "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {
    "A": 2.416969399153642,
    "L": 0.7097133618233342,
    "alpha": 0.09225417668880848
  },
  "('cerebras/Cerebras-GPT-1.3B', 'flan')": {
    "A": 3.0954136626697317,
    "L": 0.9803024318962148,
    "alpha": 0.09195872279643587
  },
  "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {
    "A": 4.857135213838401,
    "L": 1.000000002361188e-06,
    "alpha": 0.09575035015064552
  },
  "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {
    "A": 2.6112077543046617,
    "L": 0.8413814859856835,
    "alpha": 0.08975874488469686
  },
  "('cerebras/Cerebras-GPT-256M', 'flan')": {
    "A": 3.84163336780236,
    "L": 1.0409020111902445,
    "alpha": 0.08226644295724701
  },
  "('cerebras/Cerebras-GPT-256M', 'gigaword')": {
    "A": 5.787318908898869,
    "L": 1.0000000028874842e-06,
    "alpha": 0.11196743599817105
  },
  "('cerebras/Cerebras-GPT-256M', 'wikiword')": {
    "A": 4.20836124377278,
    "L": 0.5395374397768178,
    "alpha": 0.0917616156915809
  },
  "('facebook/bart-base', 'flan')": {
    "A": 7.161143292734261,
    "L": 1.0000000359978954e-06,
    "alpha": 0.09164741438063023
  },
  "('facebook/bart-base', 'gigaword')": {
    "A": 9.296093337645615,
    "L": 1.0000000040655927e-06,
    "alpha": 0.15813535276647986
  },
  "('facebook/bart-base', 'wikiword')": {
    "A": 5.830904621700831,
    "L": 0.2869492712213567,
    "alpha": 0.12178125982927294
  },
  "('facebook/bart-large', 'flan')": {
    "A": 5.0615812536821565,
    "L": 0.3490992285087147,
    "alpha": 0.08908882018311826
  },
  "('facebook/bart-large', 'gigaword')": {
    "A": 9.506914846719734,
    "L": 1.0000000022481673e-06,
    "alpha": 0.16937069022924925
  },
  "('facebook/bart-large', 'wikiword')": {
    "A": 2.625867642492629,
    "L": 0.7736133859382263,
    "alpha": 0.11446096205772392
  },
  "('facebook/opt-1.3b', 'flan')": {
    "A": 2.6048225290308236,
    "L": 0.9043414023323434,
    "alpha": 0.09063834250293701
  },
  "('facebook/opt-1.3b', 'gigaword')": {
    "A": 5.693458071267297,
    "L": 1.0000000020853807e-06,
    "alpha": 0.11822794821876381
  },
  "('facebook/opt-1.3b', 'wikiword')": {
    "A": 1.6932257825455985,
    "L": 0.863560283425031,
    "alpha": 0.09198305533257471
  },
  "('facebook/opt-350m', 'flan')": {
    "A": 4.119749020216196,
    "L": 0.7371203133386277,
    "alpha": 0.08549184563226274
  },
  "('facebook/opt-350m', 'gigaword')": {
    "A": 7.476893095942596,
    "L": 1.0000000026274898e-06,
    "alpha": 0.14033904872585587
  },
  "('facebook/opt-350m', 'wikiword')": {
    "A": 2.75049037744945,
    "L": 0.7346714054714121,
    "alpha": 0.09010627836738455
  },
  "('facebook/opt-6.7b', 'flan')": {
    "A": 0.9117066035927738,
    "L": 1.4647574764645663,
    "alpha": 0.08924638523752465
  },
  "('facebook/opt-6.7b', 'gigaword')": {
    "A": 0.7273693244609847,
    "L": 1.5892301308000973,
    "alpha": 0.09306258326029347
  },
  "('facebook/opt-6.7b', 'wikiword')": {
    "A": 1.2878365209695413,
    "L": 0.924046908237266,
    "alpha": 0.09235356420614224
  },
  "('google/mt5-base', 'flan')": {
    "A": 4.173545355842783,
    "L": 0.7586998661581449,
    "alpha": 0.09310904568468663
  },
  "('google/mt5-base', 'gigaword')": {
    "A": 2.121768452406966,
    "L": 1.6177582480620043,
    "alpha": 0.08865913268490061
  },
  "('google/mt5-base', 'wikiword')": {
    "A": 4.880269563742019,
    "L": 1.0000000017002558e-06,
    "alpha": 0.0961777508058187
  },
  "('google/mt5-large', 'flan')": {
    "A": 2.916718912178606,
    "L": 0.895946679190918,
    "alpha": 0.09308729553091386
  },
  "('google/mt5-large', 'gigaword')": {
    "A": 2.492796478693708,
    "L": 1.4237510860058773,
    "alpha": 0.0901878292592061
  },
  "('google/mt5-large', 'wikiword')": {
    "A": 3.8149792877009796,
    "L": 0.28152963055589475,
    "alpha": 0.0927984918782548
  },
  "('gpt2', 'flan')": {
    "A": 7.736245982667956,
    "L": 0.02129659579525276,
    "alpha": 0.09095789078505706
  },
  "('gpt2', 'gigaword')": {
    "A": 7.448124477442126,
    "L": 1.0000000030419932e-06,
    "alpha": 0.13683519625150198
  },
  "('gpt2', 'wikiword')": {
    "A": 3.7224904859431525,
    "L": 0.5140524210828509,
    "alpha": 0.09295215658695029
  },
  "('t5-base', 'flan')": {
    "A": 3.0420404457436288,
    "L": 0.797793577830627,
    "alpha": 0.08591994556309528
  },
  "('t5-base', 'gigaword')": {
    "A": 1.8235671433738498,
    "L": 0.4163994808667427,
    "alpha": 0.16742461953579632
  },
  "('t5-base', 'wikiword')": {
    "A": 1.726395754984347,
    "L": 0.7413366134228406,
    "alpha": 0.0907796674347665
  },
  "('t5-small', 'flan')": {
    "A": 3.4592068416893786,
    "L": 0.9447494529027566,
    "alpha": 0.08826656681486236
  },
  "('t5-small', 'gigaword')": {
    "A": 1.77294022802986,
    "L": 0.4066064048199717,
    "alpha": 0.13549712056259158
  },
  "('t5-small', 'wikiword')": {
    "A": 2.3051873945836134,
    "L": 0.7411113007567804,
    "alpha": 0.09117916657489698
  }
}

# Fallback parameters (median across groups)
FALLBACK = {
  "L": 0.5474210282039588,
  "A": 3.768734886822066,
  "alpha": 0.0921186160106916
}

def _predict_loss(n: float, p: dict[str, float]) -> float:
    if n is None or not (n > 0):
        return float('nan')
    L = float(p.get('L', 0.0))
    A = float(p.get('A', 1.0))
    alpha = float(p.get('alpha', 0.5))
    return L + A * (n ** (-alpha))

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    p = PARAMS.get(group, FALLBACK)
    out = []
    for row in input_data:
        n = row.get('sft_data_size')
        yhat = _predict_loss(float(n), p) if n is not None else float('nan')
        out.append({'sft_loss': float(yhat)})
    return out

#4 Run 4 R² = 0.799521

▼

Python

# Auto-generated scaling law for SFT loss
# Formula: sft_loss = c + a * sft_data_size ** (-b)
# Parameters differ by group; functional form is constant across groups.

from typing import List, Dict

COEFFS: Dict[str, Dict[str, float]] = {
  "('MBZUAI/LaMini-GPT-124M', 'flan')": {
    "a": 10.222339742370627,
    "b": 0.21280634514155633,
    "c": 1.721022189744976
  },
  "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {
    "a": 12.320547355485184,
    "b": 0.26089547694991816,
    "c": 0.9675486463460449
  },
  "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {
    "a": 4.100772188047223,
    "b": 0.18152648967340554,
    "c": 1.2758902365563987
  },
  "('MBZUAI/LaMini-GPT-774M', 'flan')": {
    "a": 6.98811459026019,
    "b": 0.19248917759073345,
    "c": 1.5096042044247693
  },
  "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {
    "a": 11.377717006008076,
    "b": 0.267426321785479,
    "c": 0.8347296614185309
  },
  "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {
    "a": 2.6222828415682673,
    "b": 0.16267728670631554,
    "c": 1.1374823876758626
  },
  "('cerebras/Cerebras-GPT-1.3B', 'flan')": {
    "a": 3.330159186625065,
    "b": 0.15779434393768396,
    "c": 1.5049729117647779
  },
  "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {
    "a": 6.976305162979089,
    "b": 0.22457075012382388,
    "c": 1.0526920060489429
  },
  "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {
    "a": 2.778338954201265,
    "b": 0.15443803643412407,
    "c": 1.2940011824205082
  },
  "('cerebras/Cerebras-GPT-256M', 'flan')": {
    "a": 4.005536907737473,
    "b": 0.14619768148504508,
    "c": 1.7689484243906948
  },
  "('cerebras/Cerebras-GPT-256M', 'gigaword')": {
    "a": 9.359113292426192,
    "b": 0.24469706695166904,
    "c": 0.9834338424400386
  },
  "('cerebras/Cerebras-GPT-256M', 'wikiword')": {
    "a": 5.05504398023467,
    "b": 0.1909152356372,
    "c": 1.4279016737075942
  },
  "('facebook/bart-base', 'flan')": {
    "a": 9.986586175420015,
    "b": 0.219148740632597,
    "c": 1.640807282357686
  },
  "('facebook/bart-base', 'gigaword')": {
    "a": 19.128418719190538,
    "b": 0.30473021434712627,
    "c": 0.855779466239328
  },
  "('facebook/bart-base', 'wikiword')": {
    "a": 9.288530661930713,
    "b": 0.2583420783293094,
    "c": 1.2382722929449206
  },
  "('facebook/bart-large', 'flan')": {
    "a": 6.270260870036552,
    "b": 0.19607675604898817,
    "c": 1.4826106276651974
  },
  "('facebook/bart-large', 'gigaword')": {
    "a": 20.51868024266526,
    "b": 0.3198573488096475,
    "c": 0.7656923875695543
  },
  "('facebook/bart-large', 'wikiword')": {
    "a": 3.005653950106531,
    "b": 0.17843702366373163,
    "c": 1.0886052856336208
  },
  "('facebook/opt-1.3b', 'flan')": {
    "a": 2.749677843064501,
    "b": 0.15189436417307506,
    "c": 1.3373993667209183
  },
  "('facebook/opt-1.3b', 'gigaword')": {
    "a": 9.676777613999967,
    "b": 0.2590315295208322,
    "c": 0.9297285692706154
  },
  "('facebook/opt-1.3b', 'wikiword')": {
    "a": 1.719029960352032,
    "b": 0.1371889870827748,
    "c": 1.0947027162070562
  },
  "('facebook/opt-350m', 'flan')": {
    "a": 4.583073463469868,
    "b": 0.16532716547261397,
    "c": 1.5711046489173983
  },
  "('facebook/opt-350m', 'gigaword')": {
    "a": 14.362483066471071,
    "b": 0.2860946378187169,
    "c": 0.894883766967793
  },
  "('facebook/opt-350m', 'wikiword')": {
    "a": 2.991032639803012,
    "b": 0.16546405947502613,
    "c": 1.2590318169852763
  },
  "('facebook/opt-6.7b', 'flan')": {
    "a": 0.9497789352170123,
    "b": 0.07552355590456462,
    "c": 1.3931993183419757
  },
  "('facebook/opt-6.7b', 'gigaword')": {
    "a": 0.8210010689987665,
    "b": 0.060280813546033644,
    "c": 1.4296167223845089
  },
  "('facebook/opt-6.7b', 'wikiword')": {
    "a": 1.2745183253604955,
    "b": 0.11943224747206747,
    "c": 1.0459593750506353
  },
  "('google/mt5-base', 'flan')": {
    "a": 4.848102934246837,
    "b": 0.1814689362651268,
    "c": 1.5750980787060067
  },
  "('google/mt5-base', 'gigaword')": {
    "a": 2.0892991998041346,
    "b": 0.11066805689452665,
    "c": 1.7960272972239193
  },
  "('google/mt5-base', 'wikiword')": {
    "a": 7.07134076334372,
    "b": 0.23142747562751959,
    "c": 1.0970276915538568
  },
  "('google/mt5-large', 'flan')": {
    "a": 3.1617036326215486,
    "b": 0.16107663630524463,
    "c": 1.393359872806398
  },
  "('google/mt5-large', 'gigaword')": {
    "a": 2.5074166691183795,
    "b": 0.12452313908352985,
    "c": 1.7030248504135217
  },
  "('google/mt5-large', 'wikiword')": {
    "a": 4.878109306751101,
    "b": 0.20774103520443024,
    "c": 1.1349059912317212
  },
  "('gpt2', 'flan')": {
    "a": 10.696620740039227,
    "b": 0.21381293880470648,
    "c": 1.7542673564938083
  },
  "('gpt2', 'gigaword')": {
    "a": 13.94318348952227,
    "b": 0.2778055355186586,
    "c": 0.9041445149959696
  },
  "('gpt2', 'wikiword')": {
    "a": 4.475465647006644,
    "b": 0.19027842690009428,
    "c": 1.2776994882836155
  },
  "('t5-base', 'flan')": {
    "a": 3.2391224870437214,
    "b": 0.15468394052503098,
    "c": 1.3722191545029796
  },
  "('t5-base', 'gigaword')": {
    "a": 2.1969110113796746,
    "b": 0.21735306656480474,
    "c": 0.5006256694729821
  },
  "('t5-base', 'wikiword')": {
    "a": 1.7755175801642327,
    "b": 0.14166331237878174,
    "c": 0.9977652746259214
  },
  "('t5-small', 'flan')": {
    "a": 3.7204246492776534,
    "b": 0.15796169239030738,
    "c": 1.582099555532
  },
  "('t5-small', 'gigaword')": {
    "a": 2.1951193979994312,
    "b": 0.20619249726197017,
    "c": 0.5732957704253894
  },
  "('t5-small', 'wikiword')": {
    "a": 2.4517774544927837,
    "b": 0.15379493226337212,
    "c": 1.1248436924193128
  },
  "__default__": {
    "a": 4.009205911858358,
    "b": 0.10560216502408726,
    "c": 0.5006256694729821
  }
}

def _get_params_for_group(group: str) -> Dict[str, float]:
    if not isinstance(group, str):
        return COEFFS.get("__default__", list(COEFFS.values())[0])
    key = group
    if key in COEFFS:
        return COEFFS[key]
    # case-insensitive lookup
    lower_map = {k.lower(): k for k in COEFFS.keys()}
    if key.lower() in lower_map:
        return COEFFS[lower_map[key.lower()]]
    return COEFFS.get("__default__", list(COEFFS.values())[0])

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups,
               but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _get_params_for_group(group)
    c = float(params.get("c", 0.0))
    a = float(params.get("a", 1.0))
    b = float(params.get("b", 0.5))
    out: list[dict[str, float]] = []
    for row in input_data:
        s = float(row.get("sft_data_size", 0.0))
        # Guard for non-positive sizes
        if s <= 0:
            yhat = float(c + a)  # fallback
        else:
            yhat = float(c + a * (s ** (-b)))
        out.append({"sft_loss": yhat})
    return out

#5 Run 5 R² = 0.721476

▼

Python

from typing import List, Dict
import math

# Discovered scaling law parameters per group for: sft_loss = a + c * sft_data_size ** (-alpha)
PARAMS: Dict[str, Dict[str, float]] = {
  "('MBZUAI/LaMini-GPT-124M', 'flan')": {
    "a": 0.790699041420406,
    "alpha": 0.10351089588377727,
    "c": 6.476424025207671
  },
  "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.1056900726392252,
    "c": 5.946489526504135
  },
  "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {
    "a": 0.8598216592762249,
    "alpha": 0.1056900726392252,
    "c": 3.2369503044164687
  },
  "('MBZUAI/LaMini-GPT-774M', 'flan')": {
    "a": 0.9075498078231787,
    "alpha": 0.10351089588377727,
    "c": 4.8480780922984765
  },
  "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.11004842615012109,
    "c": 5.372961651185846
  },
  "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {
    "a": 0.9183199182816504,
    "alpha": 0.11004842615012109,
    "c": 2.266079975011127
  },
  "('cerebras/Cerebras-GPT-1.3B', 'flan')": {
    "a": 1.2442064476081625,
    "alpha": 0.10786924939467314,
    "c": 2.8658834173325083
  },
  "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {
    "a": 0.3146945404290659,
    "alpha": 0.1056900726392252,
    "c": 4.491944618944813
  },
  "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {
    "a": 1.0767260533938707,
    "alpha": 0.1056900726392252,
    "c": 2.397611260105994
  },
  "('cerebras/Cerebras-GPT-256M', 'flan')": {
    "a": 1.3837123353179746,
    "alpha": 0.09237288135593222,
    "c": 3.420524828046365
  },
  "('cerebras/Cerebras-GPT-256M', 'gigaword')": {
    "a": 0.10007866721006853,
    "alpha": 0.1056900726392252,
    "c": 5.239812743829203
  },
  "('cerebras/Cerebras-GPT-256M', 'wikiword')": {
    "a": 0.9043339038916909,
    "alpha": 0.11004842615012109,
    "c": 3.960719642719618
  },
  "('facebook/bart-base', 'flan')": {
    "a": 0.6184084582954843,
    "alpha": 0.1056900726392252,
    "c": 6.513115640886011
  },
  "('facebook/bart-base', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.13401937046004847,
    "c": 7.636029974479608
  },
  "('facebook/bart-base', 'wikiword')": {
    "a": 0.3874016174036133,
    "alpha": 0.12917675544794188,
    "c": 5.898021650901651
  },
  "('facebook/bart-large', 'flan')": {
    "a": 0.8052128093471991,
    "alpha": 0.1056900726392252,
    "c": 4.680762496215878
  },
  "('facebook/bart-large', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.14249394673123486,
    "c": 7.648162334623565
  },
  "('facebook/bart-large', 'wikiword')": {
    "a": 0.8667983322336061,
    "alpha": 0.12651331719128328,
    "c": 2.625072086310989
  },
  "('facebook/opt-1.3b', 'flan')": {
    "a": 1.1291376286454256,
    "alpha": 0.1056900726392252,
    "c": 2.3951203811959285
  },
  "('facebook/opt-1.3b', 'gigaword')": {
    "a": 0.0428404440471033,
    "alpha": 0.11004842615012109,
    "c": 5.207725354628534
  },
  "('facebook/opt-1.3b', 'wikiword')": {
    "a": 1.0114362962753296,
    "alpha": 0.11004842615012109,
    "c": 1.5878995701870624
  },
  "('facebook/opt-350m', 'flan')": {
    "a": 1.222878403219055,
    "alpha": 0.10351089588377727,
    "c": 3.6065255823631324
  },
  "('facebook/opt-350m', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.12312348668280874,
    "c": 6.489336172308496
  },
  "('facebook/opt-350m', 'wikiword')": {
    "a": 0.9804737553304538,
    "alpha": 0.11004842615012109,
    "c": 2.6146381175043194
  },
  "('facebook/opt-6.7b', 'flan')": {
    "a": 1.5458232189031926,
    "alpha": 0.10786924939467314,
    "c": 0.8596509729745894
  },
  "('facebook/opt-6.7b', 'gigaword')": {
    "a": 1.6573439061969082,
    "alpha": 0.10351089588377727,
    "c": 0.6280318367866337
  },
  "('facebook/opt-6.7b', 'wikiword')": {
    "a": 1.0363566762746874,
    "alpha": 0.11222760290556903,
    "c": 1.2253126948973887
  },
  "('google/mt5-base', 'flan')": {
    "a": 1.1131949092238613,
    "alpha": 0.11004842615012109,
    "c": 3.8930952385830904
  },
  "('google/mt5-base', 'gigaword')": {
    "a": 1.8273491136899558,
    "alpha": 0.1056900726392252,
    "c": 1.925439922690624
  },
  "('google/mt5-base', 'wikiword')": {
    "a": 0.3638587378999745,
    "alpha": 0.11222760290556903,
    "c": 4.621303363999233
  },
  "('google/mt5-large', 'flan')": {
    "a": 1.143692555167632,
    "alpha": 0.11004842615012109,
    "c": 2.7206765385461735
  },
  "('google/mt5-large', 'gigaword')": {
    "a": 1.6811125715819986,
    "alpha": 0.10351089588377727,
    "c": 2.159913444176315
  },
  "('google/mt5-large', 'wikiword')": {
    "a": 0.6146203671723408,
    "alpha": 0.11440677966101698,
    "c": 3.675861062356107
  },
  "('gpt2', 'flan')": {
    "a": 0.7916709426646357,
    "alpha": 0.10351089588377727,
    "c": 6.716079382662674
  },
  "('gpt2', 'gigaword')": {
    "a": 0.0,
    "alpha": 0.11658595641646491,
    "c": 6.305790696642276
  },
  "('gpt2', 'wikiword')": {
    "a": 0.8313159187381804,
    "alpha": 0.11004842615012109,
    "c": 3.474574037621201
  },
  "('t5-base', 'flan')": {
    "a": 1.1327222235180263,
    "alpha": 0.1056900726392252,
    "c": 2.756063479323879
  },
  "('t5-base', 'gigaword')": {
    "a": 0.4149675868147168,
    "alpha": 0.16646489104116224,
    "c": 1.8164624249306538
  },
  "('t5-base', 'wikiword')": {
    "a": 0.8895207492301833,
    "alpha": 0.1056900726392252,
    "c": 1.5872010595195376
  },
  "('t5-small', 'flan')": {
    "a": 1.2792346884811308,
    "alpha": 0.1056900726392252,
    "c": 3.1690824030803815
  },
  "('t5-small', 'gigaword')": {
    "a": 0.40252893117710303,
    "alpha": 0.13450363196125908,
    "c": 1.7724517895533356
  },
  "('t5-small', 'wikiword')": {
    "a": 0.9358926760825715,
    "alpha": 0.1056900726392252,
    "c": 2.1182803806409276
  }
}

DEFAULT_PARAMS = {"a": 0.791093711721, "c": 3.78217048039, "alpha": 0.112031592298}

def _predict_loss(n: float, a: float, c: float, alpha: float) -> float:
    n_eff = max(1e-12, float(n))
    return float(a + c * (n_eff ** (-alpha)))

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts sft_loss from sft_data_size using the scaling law:
    sft_loss = a + c * sft_data_size ** (-alpha)

    Functional form is the same for all groups; parameters (a, c, alpha) vary per group.
    """
    params = PARAMS.get(group, DEFAULT_PARAMS)
    a = float(params.get("a", DEFAULT_PARAMS["a"]))
    c = float(params.get("c", DEFAULT_PARAMS["c"]))
    alpha = float(params.get("alpha", DEFAULT_PARAMS["alpha"]))
    out: list[dict[str, float]] = []
    for item in input_data:
        n = float(item.get("sft_data_size", 0.0))
        y = _predict_loss(n, a, c, alpha)
        out.append({"sft_loss": y})
    return out