Domain Mixture Scaling Law

Agent: claude-code
Model: claude-haiku-4-5
Best R²: 0.970840
Mean R²: 0.905449
Min R²: 0.792215
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.970840
Python
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The scaling law discovered is a log-linear relationship:
    loss_domain_i = a_i + b_i * log(proportion_domain_i)

    where a_i and b_i are fitted coefficients that depend on the domain and group.

    For zero proportions, a baseline loss value is used.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values (proportion_domain_i).
        group: The name of the experimental group for which to make predictions.
               Supported groups: '70M', '160M', '305M', '410M'

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s) (loss_domain_i).
    """
    import math

    # Fitted parameters: loss_domain_i = a_i + b_i * log(proportion_domain_i)
    # Format: {domain_i: {group: {a: float, b: float}}}
    params = {
        'domain_1': {
            '70M': {'a': 2.538716023062174, 'b': -0.15014064985318967},
            '160M': {'a': 2.2544843272662405, 'b': -0.1315809083635399},
            '305M': {'a': 2.1175968264069764, 'b': -0.12507426813350242},
            '410M': {'a': 2.0645520693878363, 'b': -0.1200838190624766},
        },
        'domain_2': {
            '70M': {'a': 3.5716664015145185, 'b': -0.04836401240653225},
            '160M': {'a': 3.263924262467963, 'b': -0.034830492162166084},
            '305M': {'a': 3.1006743096532867, 'b': -0.03722876491120557},
            '410M': {'a': 3.0490737355109614, 'b': -0.02679818184395711},
        },
        'domain_3': {
            '70M': {'a': 2.893348244102368, 'b': -0.09886102692916138},
            '160M': {'a': 2.623560530670634, 'b': -0.09003128254977677},
            '305M': {'a': 2.501230421480594, 'b': -0.08239118209742197},
            '410M': {'a': 2.4342910737037524, 'b': -0.08297977891663602},
        },
        'domain_4': {
            '70M': {'a': 1.4060660023243017, 'b': -0.1362756449783236},
            '160M': {'a': 1.1970241661817773, 'b': -0.12103109712578866},
            '305M': {'a': 1.101723420311251, 'b': -0.11479764855541283},
            '410M': {'a': 1.0636521193056327, 'b': -0.11371123168518989},
        },
        'domain_5': {
            '70M': {'a': 3.434597631728354, 'b': -0.12648654647382065},
            '160M': {'a': 3.0839186234824836, 'b': -0.12887711537787372},
            '305M': {'a': 2.9180204995169983, 'b': -0.1269094532480054},
            '410M': {'a': 2.8412870151502942, 'b': -0.12892583253832435},
        },
    }

    # Baseline losses at p=0 (computed from training data)
    baseline_losses = {
        'domain_1': {
            '70M': 3.4149081168601354,
            '160M': 3.060407302873726,
            '305M': 2.8980309491789353,
            '410M': 2.831881281851642,
        },
        'domain_2': {
            '70M': 3.818428775380711,
            '160M': 3.4721373995346867,
            '305M': 3.3061837391603213,
            '410M': 3.230276246563029,
        },
        'domain_3': {
            '70M': 3.60063959478022,
            '160M': 3.2855554601648356,
            '305M': 3.1556234171102338,
            '410M': 3.0982517116672392,
        },
        'domain_4': {
            '70M': 2.266334699876238,
            '160M': 1.9630575752887789,
            '305M': 1.83297445467203,
            '410M': 1.7793668136344885,
        },
        'domain_5': {
            '70M': 3.931742488662131,
            '160M': 3.594912574404762,
            '305M': 3.4344130881519273,
            '410M': 3.374610814377834,
        },
    }

    results = []

    for data_point in input_data:
        output = {}

        # Predict loss for each domain
        for domain_idx in range(1, 6):
            domain_key = f'domain_{domain_idx}'
            proportion_key = f'proportion_domain_{domain_idx}'
            loss_key = f'loss_domain_{domain_idx}'

            # Get the proportion for this domain
            proportion = data_point.get(proportion_key, 0.0)

            # Get parameters for this domain and group
            if domain_key in params and group in params[domain_key]:
                # For zero proportion, use baseline loss
                if proportion == 0:
                    if domain_key in baseline_losses and group in baseline_losses[domain_key]:
                        loss = baseline_losses[domain_key][group]
                    else:
                        loss = 3.0  # fallback value
                else:
                    # Apply the log-linear model: loss = a + b*log(proportion)
                    a = params[domain_key][group]['a']
                    b = params[domain_key][group]['b']
                    loss = a + b * math.log(proportion)

                output[loss_key] = loss

        results.append(output)

    return results
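
The best run's law function is self-contained and can be called directly. Below is a minimal usage sketch; the mixture proportions and the quoted value are illustrative only and are not drawn from the benchmark data.

Python
# Illustrative example: one mixture point with made-up proportions summing to 1.
example_input = [
    {
        'proportion_domain_1': 0.30,
        'proportion_domain_2': 0.10,
        'proportion_domain_3': 0.25,
        'proportion_domain_4': 0.20,
        'proportion_domain_5': 0.15,
    }
]

predictions = law(example_input, group='160M')
# Each output dict maps 'loss_domain_1' ... 'loss_domain_5' to a predicted loss,
# e.g. loss_domain_1 = 2.2545 - 0.1316 * log(0.30) ≈ 2.413.
print(predictions[0])
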
#2 Run 2 R² = 0.968671
#3 Run 3 R² = 0.965420
#4 Run 4 R² = 0.830099
#5 Run 5 R² = 0.792215
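
For context, coefficients of the form a + b * log(proportion) are typically obtained by an ordinary least-squares fit of observed loss against log(proportion), separately for each domain and model size. The sketch below is an assumption about how such a fit could be reproduced, not the agent's actual fitting code; the observation arrays are made up for illustration.

Python
import numpy as np

def fit_log_linear(proportions, losses):
    """Least-squares fit of loss = a + b * log(p), ignoring zero proportions."""
    p = np.asarray(proportions, dtype=float)
    y = np.asarray(losses, dtype=float)
    mask = p > 0  # log(p) is undefined at p = 0; the submission uses a baseline loss there
    b, a = np.polyfit(np.log(p[mask]), y[mask], 1)  # returns (slope, intercept)
    return a, b

# Illustrative observations only (not benchmark data):
a, b = fit_log_linear([0.05, 0.10, 0.20, 0.40], [2.85, 2.76, 2.68, 2.59])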