← Back to Leaderboard

SFT Scaling Law

Agent: claude-code
Model: claude-sonnet-4-5
Best R²: 0.948364
Mean R²: 0.916139
Min R²: 0.787239
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.948364
Python
import json
import math

# Load fitted parameters from the training data
# These parameters were obtained by fitting L(N) = a * log(N) + b to each group
# Per-group coefficients for the scaling law L(N) = a * log(N) + b (natural log),
# consumed by law() below.
# Keys are stringified (model_name, dataset_name) tuples, e.g. "('gpt2', 'flan')".
# For each group:
#   a — slope on log(N); negative throughout, i.e. loss decreases with more SFT data
#   b — intercept (predicted loss at N = 1)
# NOTE(review): per the comment in the original listing these were obtained by
# fitting L(N) = a * log(N) + b to each group's training data — the fitting
# script itself is not shown here, so confirm provenance before editing values.
FITTED_PARAMS = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {"a": -0.2858284253987973, "b": 5.988265549351065},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {"a": -0.2633818402656468, "b": 4.737755542012152},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {"a": -0.1380796078939516, "b": 3.391898490213529},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {"a": -0.21223122991733276, "b": 4.783598287214052},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {"a": -0.23807369821093685, "b": 4.2045733043013165},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {"a": -0.09602793909920557, "b": 2.652870267456121},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {"a": -0.12168554599239426, "b": 3.4606313079569837},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {"a": -0.19060958285317242, "b": 3.820355959611436},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {"a": -0.10204513742983291, "b": 2.950335143562661},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {"a": -0.14467116003927433, "b": 4.210632462544093},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {"a": -0.22564133332553715, "b": 4.218447739736505},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {"a": -0.16694369363861158, "b": 3.9284497987057874},
    "('facebook/bart-base', 'flan')": {"a": -0.27885014029903604, "b": 5.722811837645894},
    "('facebook/bart-base', 'gigaword')": {"a": -0.3347248647552073, "b": 5.410755825604152},
    "('facebook/bart-base', 'wikiword')": {"a": -0.2413552868912743, "b": 4.4926337354168595},
    "('facebook/bart-large', 'flan')": {"a": -0.19814331941245988, "b": 4.453518961526505},
    "('facebook/bart-large', 'gigaword')": {"a": -0.3339209236977352, "b": 5.242535980974371},
    "('facebook/bart-large', 'wikiword')": {"a": -0.10813142202742225, "b": 2.7193492499816334},
    "('facebook/opt-1.3b', 'flan')": {"a": -0.10226900935941804, "b": 3.0035271247006574},
    "('facebook/opt-1.3b', 'gigaword')": {"a": -0.22503850880208404, "b": 4.067383747817735},
    "('facebook/opt-1.3b', 'wikiword')": {"a": -0.06738256744904991, "b": 2.227609751673505},
    "('facebook/opt-350m', 'flan')": {"a": -0.1567771098875299, "b": 4.096427281007177},
    "('facebook/opt-350m', 'gigaword')": {"a": -0.28416051180558977, "b": 4.789217216189481},
    "('facebook/opt-350m', 'wikiword')": {"a": -0.10931922565009441, "b": 2.969051299001184},
    "('facebook/opt-6.7b', 'flan')": {"a": -0.035974453233156484, "b": 2.2060635294933304},
    "('facebook/opt-6.7b', 'gigaword')": {"a": -0.027858065828614262, "b": 2.1626361790690503},
    "('facebook/opt-6.7b', 'wikiword')": {"a": -0.05153332226677372, "b": 1.9623193491235948},
    "('google/mt5-base', 'flan')": {"a": -0.16562129412487037, "b": 4.0988075329513345},
    "('google/mt5-base', 'gigaword')": {"a": -0.08234869176487448, "b": 3.3355539218016768},
    "('google/mt5-base', 'wikiword')": {"a": -0.19493726211114437, "b": 3.861648173041152},
    "('google/mt5-large', 'flan')": {"a": -0.11550187851501488, "b": 3.2282318950626876},
    "('google/mt5-large', 'gigaword')": {"a": -0.09533853604323887, "b": 3.4146676286886763},
    "('google/mt5-large', 'wikiword')": {"a": -0.15291379226040927, "b": 3.353099664653985},
    "('gpt2', 'flan')": {"a": -0.29617705608594097, "b": 6.179866386147315},
    "('gpt2', 'gigaword')": {"a": -0.28087485939518764, "b": 4.8171995484035675},
    "('gpt2', 'wikiword')": {"a": -0.14749971599228653, "b": 3.4933775691623454},
    "('t5-base', 'flan')": {"a": -0.11717228671842463, "b": 3.2855166649939935},
    "('t5-base', 'gigaword')": {"a": -0.0696292965522014, "b": 1.480251152476475},
    "('t5-base', 'wikiword')": {"a": -0.06785004652930211, "b": 2.132300508433401},
    "('t5-small', 'flan')": {"a": -0.13444290860067154, "b": 3.752075115263242},
    "('t5-small', 'gigaword')": {"a": -0.07229854246705678, "b": 1.6089334546678618},
    "('t5-small', 'wikiword')": {"a": -0.09053553061557014, "b": 2.594400260204647},
}


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """Predict SFT loss for each data point using a per-group logarithmic law.

    The discovered scaling law is

        L(N) = a * log(N) + b

    where N is the fine-tuning dataset size (``sft_data_size``), L is the
    predicted ``sft_loss``, log is the natural logarithm, and (a, b) are the
    group-specific coefficients stored in FITTED_PARAMS. The functional form
    is shared by all groups; only the coefficients differ.

    Args:
        input_data: List of data points; each dict must contain the key
            "sft_data_size" with a positive value (math.log requires N > 0).
        group: Experimental group name, used to look up (a, b).

    Returns:
        A list parallel to ``input_data``; each element is a dict with the
        single key "sft_loss" holding the predicted loss.

    Raises:
        ValueError: If ``group`` has no fitted coefficients.
    """
    # Fail fast with the list of known groups if the lookup would miss.
    if group not in FITTED_PARAMS:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(FITTED_PARAMS.keys())}")

    coeffs = FITTED_PARAMS[group]
    slope, intercept = coeffs["a"], coeffs["b"]

    # Evaluate L(N) = a * log(N) + b for every point in one pass.
    return [
        {"sft_loss": slope * math.log(point["sft_data_size"]) + intercept}
        for point in input_data
    ]
#2 Run 2 R² = 0.948364
#3 Run 3 R² = 0.948364
#4 Run 4 R² = 0.948364
#5 Run 5 R² = 0.787239