← Back to Leaderboard

SFT Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.893342
Mean R²: 0.845450
Min R²: 0.756889
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.893342
Python
from __future__ import annotations
from typing import List, Dict

# Discovered scaling law (shared functional form across groups):
#   sft_loss(N) = L_inf + A * N^(-alpha)
# Coefficients are fitted per experimental group.

# Per-group coefficients for the shared power law L_inf + A * N**(-alpha),
# fitted from /app/data. Keys are stringified (model, dataset) tuples;
# each value supplies the three constants consumed by _predict_one().
PARAMS: Dict[str, Dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {'L_inf': 0.0000000000, 'A': 7.4655976596, 'alpha': 0.0893833639},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 6.9696265408, 'alpha': 0.1248879757},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.8453873318, 'alpha': 0.0660024991},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {'L_inf': 0.0000000000, 'A': 5.7130408378, 'alpha': 0.0782922582},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 6.2931215297, 'alpha': 0.1291848119},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.8960123299, 'alpha': 0.0544754427},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {'L_inf': 0.0000000000, 'A': 3.7633686465, 'alpha': 0.0524564535},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {'L_inf': 0.0000000000, 'A': 4.8571354747, 'alpha': 0.0957503004},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.1982974523, 'alpha': 0.0512269552},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {'L_inf': 0.0000000000, 'A': 4.5820265379, 'alpha': 0.0513362827},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {'L_inf': 0.0000000000, 'A': 5.7873187312, 'alpha': 0.1119673779},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.5087532133, 'alpha': 0.0704142022},
    "('facebook/bart-base', 'flan')": {'L_inf': 0.0000000000, 'A': 7.1611436323, 'alpha': 0.0916473833},
    "('facebook/bart-base', 'gigaword')": {'L_inf': 0.0000000000, 'A': 9.2960910354, 'alpha': 0.1581352715},
    "('facebook/bart-base', 'wikiword')": {'L_inf': 0.2857551447, 'A': 5.8306038938, 'alpha': 0.1217045037},
    "('facebook/bart-large', 'flan')": {'L_inf': 0.0000000000, 'A': 5.2395659867, 'alpha': 0.0767344267},
    "('facebook/bart-large', 'gigaword')": {'L_inf': 0.0000000000, 'A': 9.5069117910, 'alpha': 0.1693705958},
    "('facebook/bart-large', 'wikiword')": {'L_inf': 0.7850518893, 'A': 2.6271353620, 'alpha': 0.1159319757},
    "('facebook/opt-1.3b', 'flan')": {'L_inf': 0.0000000000, 'A': 3.2428955975, 'alpha': 0.0499613896},
    "('facebook/opt-1.3b', 'gigaword')": {'L_inf': 0.0000000000, 'A': 5.6934577617, 'alpha': 0.1182278832},
    "('facebook/opt-1.3b', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.3523591081, 'alpha': 0.0419183827},
    "('facebook/opt-350m', 'flan')": {'L_inf': 0.0000000000, 'A': 4.5858563672, 'alpha': 0.0606207735},
    "('facebook/opt-350m', 'gigaword')": {'L_inf': 0.0000000000, 'A': 7.4768918755, 'alpha': 0.1403389747},
    "('facebook/opt-350m', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.2500719821, 'alpha': 0.0557536570},
    "('facebook/opt-6.7b', 'flan')": {'L_inf': 0.0000000000, 'A': 2.2344284065, 'alpha': 0.0191717060},
    "('facebook/opt-6.7b', 'gigaword')": {'L_inf': 0.0000000000, 'A': 2.1808236698, 'alpha': 0.0146927813},
    "('facebook/opt-6.7b', 'wikiword')": {'L_inf': 0.2715471457, 'A': 1.7859035415, 'alpha': 0.0422749399},
    "('google/mt5-base', 'flan')": {'L_inf': 0.0000000000, 'A': 4.6211698165, 'alpha': 0.0648699072},
    "('google/mt5-base', 'gigaword')": {'L_inf': 0.0000000000, 'A': 3.4542367430, 'alpha': 0.0321327577},
    "('google/mt5-base', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.8802698557, 'alpha': 0.0961777019},
    "('google/mt5-large', 'flan')": {'L_inf': 0.0000000000, 'A': 3.5193429654, 'alpha': 0.0536772855},
    "('google/mt5-large', 'gigaword')": {'L_inf': 0.0000000000, 'A': 3.5889984642, 'alpha': 0.0381545424},
    "('google/mt5-large', 'wikiword')": {'L_inf': 0.0000000000, 'A': 3.9537507864, 'alpha': 0.0789696371},
    "('gpt2', 'flan')": {'L_inf': 0.0000000000, 'A': 7.7432486686, 'alpha': 0.0903489876},
    "('gpt2', 'gigaword')": {'L_inf': 0.0000000000, 'A': 7.4481233493, 'alpha': 0.1368351252},
    "('gpt2', 'wikiword')": {'L_inf': 0.0000000000, 'A': 4.0053815966, 'alpha': 0.0698352636},
    "('t5-base', 'flan')": {'L_inf': 0.0000000000, 'A': 3.5927162124, 'alpha': 0.0538530120},
    "('t5-base', 'gigaword')": {'L_inf': 0.4187926280, 'A': 1.8293722612, 'alpha': 0.1684931683},
    "('t5-base', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.2734524248, 'alpha': 0.0452549072},
    "('t5-small', 'flan')": {'L_inf': 0.0000000000, 'A': 4.1031621457, 'alpha': 0.0541145502},
    "('t5-small', 'gigaword')": {'L_inf': 0.4134344504, 'A': 1.7782418386, 'alpha': 0.1373669844},
    "('t5-small', 'wikiword')": {'L_inf': 0.0000000000, 'A': 2.8159260594, 'alpha': 0.0518360448},
}

# Fallback parameters used when an unseen group is requested.
# NOTE(review): described upstream as "medians over fitted groups", but the
# values look hand-rounded (A=4.0, alpha=0.07) rather than exact medians —
# treat them as rough central estimates, not computed statistics.
_FALLBACK = {'L_inf': 0.0, 'A': 4.0, 'alpha': 0.07}


def _predict_one(n: float, params: Dict[str, float]) -> float:
    # Guard against non-positive sizes
    n = max(float(n), 1.0)
    L_inf = params['L_inf']
    A = params['A']
    alpha = params['alpha']
    return L_inf + A * (n ** (-alpha))


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).

    Raises:
        KeyError: If any input row lacks the 'sft_data_size' key.
    """
    # Unseen groups fall back to rough central coefficients.
    coeffs = PARAMS.get(group, _FALLBACK)
    predictions: List[Dict[str, float]] = []
    for point in input_data:
        if 'sft_data_size' not in point:
            raise KeyError("Missing required input key 'sft_data_size'")
        loss = _predict_one(point['sft_data_size'], coeffs)
        predictions.append({'sft_loss': float(loss)})
    return predictions
#2 Run 2 R² = 0.887104
#3 Run 3 R² = 0.866421
#4 Run 4 R² = 0.823497
#5 Run 5 R² = 0.756889