import json
import numpy as np
# Fitted parameters are embedded directly below rather than loaded from the
# JSON file at runtime, so the submitted law.py carries no file dependencies.
# (In a real deployment the parameters would be loaded once at import time.)
# Per-group fitted coefficients for the SFT scaling law
#     sft_loss = A * sft_data_size ** -B + C
# Keys are the string representations of (model_name, dataset_name) tuples,
# matching the `group` identifiers passed by callers.
FITTED_PARAMS: dict[str, dict[str, float]] = {
    "('MBZUAI/LaMini-GPT-124M', 'flan')": {"A": 6.973888553329567, "B": 0.08134159151639991, "C": 7.870182444280732e-15},
    "('MBZUAI/LaMini-GPT-774M', 'flan')": {"A": 5.416195597322399, "B": 0.0720358485868509, "C": 1.0602644871174933e-11},
    "('cerebras/Cerebras-GPT-256M', 'flan')": {"A": 4.497775760815009, "B": 0.04919729767645753, "C": 6.274119068597892e-10},
    "('cerebras/Cerebras-GPT-1.3B', 'flan')": {"A": 3.7266874945596746, "B": 0.05133622927325513, "C": 1.412382533841796e-10},
    "('facebook/bart-base', 'flan')": {"A": 6.796594386775143, "B": 0.08550169575715841, "C": 6.737958498228297e-18},
    "('facebook/bart-large', 'flan')": {"A": 5.121314339920321, "B": 0.07407829970041344, "C": 3.1975002498876555e-19},
    "('facebook/opt-1.3b', 'flan')": {"A": 3.2171035336850538, "B": 0.04905071402472535, "C": 1.4665945977868665e-17},
    "('facebook/opt-350m', 'flan')": {"A": 4.46817434881537, "B": 0.05761166207977083, "C": 6.466006159672916e-17},
    "('facebook/opt-6.7b', 'flan')": {"A": 2.2339677920266, "B": 0.019148365251614136, "C": 2.4205570716429996e-12},
    "('gpt2', 'flan')": {"A": 7.198397895874961, "B": 0.08172044535963456, "C": 7.277544347301891e-16},
    "('t5-base', 'flan')": {"A": 3.543052221279221, "B": 0.05225511975662033, "C": 5.365852391533393e-14},
    "('t5-small', 'flan')": {"A": 4.0524929680519675, "B": 0.05269102958942768, "C": 7.17728404772256e-18},
    "('google/mt5-base', 'flan')": {"A": 4.571149127204719, "B": 0.06361719290512968, "C": 7.778644966305842e-13},
    "('google/mt5-large', 'flan')": {"A": 3.492798571175553, "B": 0.05281181461988964, "C": 1.2586498420090986e-20},
    "('MBZUAI/LaMini-GPT-124M', 'gigaword')": {"A": 5.923747139325469, "B": 0.1052086841512751, "C": 3.089177194303035e-17},
    "('MBZUAI/LaMini-GPT-774M', 'gigaword')": {"A": 5.3506872228212465, "B": 0.10952371922324641, "C": 4.789842716916173e-16},
    "('cerebras/Cerebras-GPT-256M', 'gigaword')": {"A": 5.2374256588965595, "B": 0.100041202302744, "C": 2.5291593168013106e-18},
    "('cerebras/Cerebras-GPT-1.3B', 'gigaword')": {"A": 4.601802082329202, "B": 0.08937942152768066, "C": 5.975656142659641e-21},
    "('facebook/bart-base', 'gigaword')": {"A": 7.594466160506642, "B": 0.13330980387218538, "C": 2.0140930917717585e-19},
    "('facebook/bart-large', 'gigaword')": {"A": 7.68392069755279, "B": 0.1431066075849933, "C": 3.416340754873851e-21},
    "('facebook/opt-1.3b', 'gigaword')": {"A": 5.229996705172819, "B": 0.1080530305700941, "C": 4.162452364748143e-13},
    "('facebook/opt-350m', 'gigaword')": {"A": 6.499202936404031, "B": 0.1233185035135486, "C": 9.686843110254685e-16},
    "('facebook/opt-6.7b', 'gigaword')": {"A": 2.177079948663568, "B": 0.014498526945722394, "C": 1.6637361013813952e-20},
    "('gpt2', 'gigaword')": {"A": 6.339049550102067, "B": 0.1172559500524722, "C": 3.0102599329917646e-20},
    "('t5-base', 'gigaword')": {"A": 1.8233793781091008, "B": 0.16745997024393933, "C": 0.4167409793913292},
    "('t5-small', 'gigaword')": {"A": 1.7757421128794664, "B": 0.1343981132151212, "C": 0.40091836577062706},
    "('google/mt5-base', 'gigaword')": {"A": 3.4396037452096033, "B": 0.03165062212756719, "C": 2.272270358483561e-21},
    "('google/mt5-large', 'gigaword')": {"A": 3.5492577919728685, "B": 0.036881264614398473, "C": 3.110993196587435e-19},
    "('MBZUAI/LaMini-GPT-124M', 'wikiword')": {"A": 3.7815959596442967, "B": 0.06407311340689052, "C": 2.1990783058632668e-15},
    "('MBZUAI/LaMini-GPT-774M', 'wikiword')": {"A": 2.8815067635589147, "B": 0.05390255276559482, "C": 1.0505193779423451e-21},
    "('cerebras/Cerebras-GPT-256M', 'wikiword')": {"A": 4.456019622760662, "B": 0.06905741513102519, "C": 1.7674523244785396e-22},
    "('cerebras/Cerebras-GPT-1.3B', 'wikiword')": {"A": 3.167052219129391, "B": 0.0501048230571595, "C": 1.4069568777664217e-16},
    "('facebook/bart-base', 'wikiword')": {"A": 5.861960588216526, "B": 0.1201852341432514, "C": 0.25050927467318046},
    "('facebook/bart-large', 'wikiword')": {"A": 2.620750833375674, "B": 0.11520373428320353, "C": 0.781463479666753},
    "('facebook/opt-1.3b', 'wikiword')": {"A": 2.349125080913662, "B": 0.0417629915758353, "C": 4.408502521317795e-06},
    "('facebook/opt-350m', 'wikiword')": {"A": 3.2418388183227664, "B": 0.05546399769101606, "C": 2.38823275940463e-18},
    "('facebook/opt-6.7b', 'wikiword')": {"A": 1.788104942925995, "B": 0.04220829699230767, "C": 0.26932891639769574},
    "('gpt2', 'wikiword')": {"A": 3.9462672307524063, "B": 0.0681165719858611, "C": 8.567550533709646e-16},
    "('t5-base', 'wikiword')": {"A": 2.259872088758908, "B": 0.04457386967601375, "C": 3.691577925559672e-22},
    "('t5-small', 'wikiword')": {"A": 2.7905737414980694, "B": 0.05079953192207081, "C": 7.005925333025477e-15},
    "('google/mt5-base', 'wikiword')": {"A": 4.734453344250497, "B": 0.09261262606875871, "C": 1.7979546095664325e-22},
    "('google/mt5-large', 'wikiword')": {"A": 3.9262973862263366, "B": 0.0781626897617864, "C": 2.168723424360531e-14},
}

# Epsilon substituted for non-positive data sizes: x ** -B is undefined (or a
# ZeroDivisionError) at x <= 0, so a tiny positive value is used instead, which
# with B > 0 simply yields a very large predicted loss.
_MIN_SFT_DATA_SIZE = 1e-9


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """Predict SFT loss from SFT data size via a fitted power law.

    The functional form is identical for every group:

        sft_loss = A * sft_data_size ** -B + C

    with per-group coefficients (A, B, C) looked up in ``FITTED_PARAMS``.

    Args:
        input_data: One dict per data point; each must contain the key
            ``'sft_data_size'`` with a numeric value. Non-positive sizes are
            clamped to a tiny epsilon so the power term remains computable.
        group: Group identifier — the string form of a (model, dataset)
            tuple, e.g. ``"('gpt2', 'flan')"`` — matching a key of
            ``FITTED_PARAMS``.

    Returns:
        A list parallel to ``input_data``; each element is a dict
        ``{'sft_loss': <predicted float>}``.

    Raises:
        ValueError: If no fitted parameters exist for ``group``.
        KeyError: If a data point is missing ``'sft_data_size'``.
    """
    if group not in FITTED_PARAMS:
        raise ValueError(
            f"Parameters not found for group: {group}. "
            f"Available groups: {list(FITTED_PARAMS.keys())}"
        )
    params = FITTED_PARAMS[group]
    a, b, c = params['A'], params['B'], params['C']

    predictions = []
    for data_point in input_data:
        size = data_point['sft_data_size']
        if size <= 0:
            # Clamp rather than raise so extrapolation queries never crash.
            size = _MIN_SFT_DATA_SIZE
        predictions.append({'sft_loss': a * size ** -b + c})
    return predictions