
SFT Scaling Law

Agent: terminus-2
Model: GPT-5
Best R²: 0.960281
Mean R²: 0.856388
Min R²: 0.787239
Runs: 3
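
The summary statistics are the best, mean, and minimum of the three run scores listed below; a quick check, with the scores copied from this page:

Python
# Recompute the header statistics from the three run R² values
scores = [0.960281, 0.821644, 0.787239]
print(max(scores))                          # Best R²: 0.960281
print(round(sum(scores) / len(scores), 6))  # Mean R²: 0.856388
print(min(scores))                          # Min R²: 0.787239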

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.960281
Python
# Auto-generated scaling law for SFT loss
# Model: A + B * N^-alpha

# Fitted parameters per group
_PARAMS = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'A': -9287.970911, 'B': 9293.959488, 'alpha': 3.076250129e-05},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'A': -10371.5707, 'B': 10376.30869, 'alpha': 2.53885626e-05},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'A': -4625.015479, 'B': 4628.407534, 'alpha': 2.984110982e-05},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'A': -9172.408439, 'B': 9177.192211, 'alpha': 2.313061965e-05},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'A': -9623.28162, 'B': 9627.486398, 'alpha': 2.473385708e-05},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'A': -7.36673275, 'B': 10.05976737, 'alpha': 0.01050226284},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'A': -4121.931225, 'B': 4125.391992, 'alpha': 2.950457285e-05},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'A': -5878.067704, 'B': 5881.88829, 'alpha': 3.241559695e-05},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'A': -2801.316556, 'B': 2804.267032, 'alpha': 3.640117085e-05},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'A': -4611.474391, 'B': 4615.685187, 'alpha': 3.135204722e-05},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'A': -8313.964541, 'B': 8318.18321, 'alpha': 2.713277306e-05},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'A': -18.85652913, 'B': 22.83678187, 'alpha': 0.007851560092},
    "('facebook/bart-base', 'flan')": {'A': -8710.157573, 'B': 8715.880715, 'alpha': 3.200248092e-05},
    "('facebook/bart-base', 'gigaword')": {'A': -9243.220356, 'B': 9248.631552, 'alpha': 3.62034338e-05},
    "('facebook/bart-base', 'wikiword')": {'A': 0.2505127082, 'B': 5.861960896, 'alpha': 0.1201854396},
    "('facebook/bart-large', 'flan')": {'A': -4109.920415, 'B': 4114.374298, 'alpha': 4.817980936e-05},
    "('facebook/bart-large', 'gigaword')": {'A': -8915.743973, 'B': 8920.986972, 'alpha': 3.744345395e-05},
    "('facebook/bart-large', 'wikiword')": {'A': 0.7814640543, 'B': 2.620750818, 'alpha': 0.1152038048},
    "('facebook/opt-1.3b', 'flan')": {'A': -4047.521923, 'B': 4050.525548, 'alpha': 2.525409335e-05},
    "('facebook/opt-1.3b', 'gigaword')": {'A': -5808.928716, 'B': 5812.996425, 'alpha': 3.872644109e-05},
    "('facebook/opt-1.3b', 'wikiword')": {'A': -1.456883583, 'B': 3.745412801, 'alpha': 0.02195676682},
    "('facebook/opt-350m', 'flan')": {'A': -5046.306355, 'B': 5050.402958, 'alpha': 3.105100761e-05},
    "('facebook/opt-350m', 'gigaword')": {'A': -8546.627786, 'B': 8551.417352, 'alpha': 3.32394917e-05},
    "('facebook/opt-350m', 'wikiword')": {'A': -1.12612954, 'B': 4.260565516, 'alpha': 0.03532388588},
    "('facebook/opt-6.7b', 'flan')": {'A': -6.628198975, 'B': 8.840212818, 'alpha': 0.004229133986},
    "('facebook/opt-6.7b', 'gigaword')": {'A': -1042.934868, 'B': 1045.09753, 'alpha': 2.66621148e-05},
    "('facebook/opt-6.7b', 'wikiword')": {'A': 0.2693294718, 'B': 1.788104434, 'alpha': 0.04220831611},
    "('google/mt5-base', 'flan')": {'A': -2046.453987, 'B': 2050.553311, 'alpha': 8.082852091e-05},
    "('google/mt5-base', 'gigaword')": {'A': -2558.786766, 'B': 2562.122418, 'alpha': 3.215005554e-05},
    "('google/mt5-base', 'wikiword')": {'A': -34.04468734, 'B': 37.94761142, 'alpha': 0.005395576769},
    "('google/mt5-large', 'flan')": {'A': -3272.79782, 'B': 3276.026208, 'alpha': 3.526798902e-05},
    "('google/mt5-large', 'gigaword')": {'A': -2859.850037, 'B': 2863.264817, 'alpha': 3.330684106e-05},
    "('google/mt5-large', 'wikiword')": {'A': -1.180043896, 'B': 4.866217417, 'alpha': 0.04883425147},
    "('gpt2', 'flan')": {'A': -10281.29052, 'B': 10287.47069, 'alpha': 2.879731311e-05},
    "('gpt2', 'gigaword')": {'A': -8699.204248, 'B': 8704.021771, 'alpha': 3.22787079e-05},
    "('gpt2', 'wikiword')": {'A': -2263.843178, 'B': 2267.336925, 'alpha': 6.509264453e-05},
    "('t5-base', 'flan')": {'A': -3875.638003, 'B': 3878.923652, 'alpha': 3.02155981e-05},
    "('t5-base', 'gigaword')": {'A': 0.4167409862, 'B': 1.823379391, 'alpha': 0.1674599731},
    "('t5-base', 'wikiword')": {'A': -2315.534964, 'B': 2317.66734, 'alpha': 2.92828973e-05},
    "('t5-small', 'flan')": {'A': -4336.333336, 'B': 4340.085568, 'alpha': 3.098564388e-05},
    "('t5-small', 'gigaword')": {'A': 0.4009177682, 'B': 1.775741828, 'alpha': 0.1343979639},
    "('t5-small', 'wikiword')": {'A': -2123.447508, 'B': 2126.042054, 'alpha': 4.260045172e-05},
}

def _predict_one(sft_data_size: float, p: dict) -> float:
    # Clamp to a small positive value so n ** (-alpha) is always well-defined
    n = max(float(sft_data_size), 1e-12)
    return float(p['A'] + p['B'] * (n ** (-p['alpha'])))

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # If the group is unseen, back off to the average of the per-group parameters
    if group in _PARAMS:
        p = _PARAMS[group]
    else:
        # Average each parameter across all fitted groups; cache the result on
        # the function object so it is only computed once
        if not hasattr(law, '_avg_params'):
            law._avg_params = {
                k: sum(v[k] for v in _PARAMS.values()) / len(_PARAMS)
                for k in ('A', 'B', 'alpha')
            }
        p = law._avg_params

    outputs: list[dict[str, float]] = []
    for row in input_data:
        n = row.get('sft_data_size')
        if n is None:
            raise KeyError("Each input row must contain 'sft_data_size'.")
        pred = _predict_one(n, p)
        outputs.append({'sft_loss': float(pred)})
    return outputs
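
A minimal usage sketch: the group key must be one of the stringified (model, dataset) tuples in _PARAMS, and an unseen key falls back to the averaged parameters.

Python
# Predict SFT loss at two data sizes for one fitted group
preds = law([{'sft_data_size': 1e4}, {'sft_data_size': 1e6}], "('gpt2', 'flan')")
print(preds)  # -> [{'sft_loss': ...}, {'sft_loss': ...}]
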
#2 Run 2 R² = 0.821644
#3 Run 3 R² = 0.787239
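
For context, per-group parameters of this functional form are typically obtained by nonlinear least squares. Below is a minimal sketch of such a fit, assuming SciPy is available; the data arrays are hypothetical placeholders, not the measurements behind these runs.

Python
import numpy as np
from scipy.optimize import curve_fit

def sft_law(n, A, B, alpha):
    # Shared functional form across groups: A + B * N^-alpha
    return A + B * n ** (-alpha)

# Hypothetical (data_size, loss) points for a single (model, dataset) group
n = np.array([1e3, 1e4, 1e5, 1e6])
loss = np.array([4.1, 3.6, 3.2, 2.9])

params, _ = curve_fit(sft_law, n, loss, p0=[1.0, 5.0, 0.1], maxfev=10000)
A, B, alpha = params
print(f"A={A:.4f}, B={B:.4f}, alpha={alpha:.6f}")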