SLD - Domain Mixture Scaling Law - claude-code + claude-sonnet-4-5

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.971598

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict per-domain losses from mixture proportions with a power law.

    Every domain i in 1..5 is modelled independently as

        loss_domain_i = L_inf_i - a_i * proportion_domain_i ** b_i

    with constants fitted separately for each (domain, group) pair. Every
    fitted ``b`` is positive, so the power term vanishes at proportion 0 and
    ``L_inf`` is the loss predicted for a proportion of 0.

    Args:
        input_data: Data points mapping ``proportion_domain_i`` to that
                    domain's proportion. A missing key is treated as a
                    proportion of 0.0.
        group: Name of the group whose constants are used ('70M', '160M',
               '305M' or '410M').

    Returns:
        One dictionary per data point, in input order, holding
        ``loss_domain_1`` .. ``loss_domain_5``.

    Raises:
        KeyError: If ``group`` has no fitted constants and ``input_data`` is
                  non-empty (the table is only consulted per data point).
    """
    # Fitted constants, indexed as params[domain][group].
    params = {
        1: {
            "70M": {"L_inf": 3.414908, "a": 0.919426, "b": 0.242859},
            "160M": {"L_inf": 3.060407, "a": 0.841478, "b": 0.227141},
            "305M": {"L_inf": 2.898031, "a": 0.812498, "b": 0.220675},
            "410M": {"L_inf": 2.831881, "a": 0.797224, "b": 0.213714},
        },
        2: {
            "70M": {"L_inf": 3.818429, "a": 0.286092, "b": 0.325881},
            "160M": {"L_inf": 3.472137, "a": 0.229089, "b": 0.253344},
            "305M": {"L_inf": 3.306184, "a": 0.231382, "b": 0.286684},
            "410M": {"L_inf": 3.230276, "a": 0.194197, "b": 0.211294},
        },
        3: {
            "70M": {"L_inf": 3.600640, "a": 0.884553, "b": 0.258117},
            "160M": {"L_inf": 3.285555, "a": 0.821255, "b": 0.248967},
            "305M": {"L_inf": 3.155623, "a": 0.780708, "b": 0.218846},
            "410M": {"L_inf": 3.098252, "a": 0.789583, "b": 0.216252},
        },
        4: {
            "70M": {"L_inf": 2.266335, "a": 0.933792, "b": 0.235431},
            "160M": {"L_inf": 1.963058, "a": 0.833632, "b": 0.236056},
            "305M": {"L_inf": 1.832974, "a": 0.793849, "b": 0.233302},
            "410M": {"L_inf": 1.779367, "a": 0.778080, "b": 0.236914},
        },
        5: {
            "70M": {"L_inf": 3.931742, "a": 0.511339, "b": 0.352486},
            "160M": {"L_inf": 3.594913, "a": 0.526879, "b": 0.354306},
            "305M": {"L_inf": 3.434413, "a": 0.530724, "b": 0.338928},
            "410M": {"L_inf": 3.374611, "a": 0.548299, "b": 0.334021},
        },
    }

    def domain_loss(sample: dict[str, float], domain_i: int) -> float:
        """Evaluate the power law for one domain of one data point."""
        # A domain that is absent from the data point counts as proportion 0.
        share = sample.get(f"proportion_domain_{domain_i}", 0.0)
        coeffs = params[domain_i][group]
        return coeffs["L_inf"] - coeffs["a"] * (share ** coeffs["b"])

    return [
        {
            f"loss_domain_{domain_i}": domain_loss(sample, domain_i)
            for domain_i in range(1, 6)
        }
        for sample in input_data
    ]

#2 Run 2 R² = 0.971145

▼

Python

import math

# Fitted constants of the logarithmic law L = a * log(p + p0) + c.
# Lookup order: PARAMS["domain_<i>"][group] -> {"a": ..., "p0": ..., "c": ...}
#   a  - coefficient multiplying the log term (negative for every entry)
#   p0 - offset added to the proportion inside the log (positive for every entry)
#   c  - additive constant
PARAMS = {
    "domain_1": {
        "160M": {"a": -0.133706, "p0": 0.002390, "c": 2.253262},
        "305M": {"a": -0.126779, "p0": 0.002106, "c": 2.116689},
        "410M": {"a": -0.121507, "p0": 0.001797, "c": 2.063766},
        "70M": {"a": -0.153400, "p0": 0.003269, "c": 2.536901},
    },
    "domain_2": {
        "160M": {"a": -0.035585, "p0": 0.002818, "c": 3.263192},
        "305M": {"a": -0.038571, "p0": 0.004694, "c": 3.099380},
        "410M": {"a": -0.027050, "p0": 0.001221, "c": 3.048828},
        "70M": {"a": -0.051208, "p0": 0.007660, "c": 3.568956},
    },
    "domain_3": {
        "160M": {"a": -0.092529, "p0": 0.000734, "c": 2.617690},
        "305M": {"a": -0.083716, "p0": 0.000388, "c": 2.498053},
        "410M": {"a": -0.084246, "p0": 0.000364, "c": 2.431248},
        "70M": {"a": -0.101971, "p0": 0.000906, "c": 2.886180},
    },
    "domain_4": {
        "160M": {"a": -0.123123, "p0": 0.001951, "c": 1.194858},
        "305M": {"a": -0.116648, "p0": 0.001864, "c": 1.099853},
        "410M": {"a": -0.115642, "p0": 0.002018, "c": 1.061758},
        "70M": {"a": -0.138490, "p0": 0.001975, "c": 1.403946},
    },
    "domain_5": {
        "160M": {"a": -0.141545, "p0": 0.027307, "c": 3.084649},
        "305M": {"a": -0.137132, "p0": 0.023254, "c": 2.918796},
        "410M": {"a": -0.139187, "p0": 0.021836, "c": 2.841747},
        "70M": {"a": -0.137505, "p0": 0.026904, "c": 3.435925},
    },
}


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict per-domain losses from mixture proportions with a logarithmic law.

    Every domain i in 1..5 is modelled independently as

        loss_domain_i = a * log(proportion_domain_i + p0) + c

    where (a, p0, c) are read from the module-level ``PARAMS`` table under
    ``PARAMS['domain_<i>'][group]``.

    Args:
        input_data: Data points; each must provide ``proportion_domain_1`` ..
                    ``proportion_domain_5``.
        group: Name of the group whose constants are used. The functional
               form is shared by all groups; only the constants differ.

    Returns:
        One dictionary per data point, in input order, holding
        ``loss_domain_1`` .. ``loss_domain_5``.

    Raises:
        KeyError: If a data point lacks a proportion key or ``group`` is not
                  present in ``PARAMS``.
    """

    def losses_for(sample: dict[str, float]) -> dict[str, float]:
        """Evaluate the law for all five domains of a single data point."""
        losses = {}
        for idx in range(1, 6):
            # Read the input first so a missing key is reported before the
            # coefficient lookup is attempted.
            share = sample[f'proportion_domain_{idx}']
            coeffs = PARAMS[f'domain_{idx}'][group]
            losses[f'loss_domain_{idx}'] = (
                coeffs['a'] * math.log(share + coeffs['p0']) + coeffs['c']
            )
        return losses

    return [losses_for(sample) for sample in input_data]

#3 Run 3 R² = 0.971145

▼

Python

import math

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict per-domain losses from mixture proportions with a shifted-log law.

    Every domain i in 1..5 is modelled independently as

        loss_domain_i = c - a * log(proportion_domain_i + b)

    where (a, b, c) are constants fitted separately per group and domain.

    Args:
        input_data: Data points; each must provide ``proportion_domain_1`` ..
                    ``proportion_domain_5``.
        group: Name of the group whose constants are used ('160M', '305M',
               '410M' or '70M').

    Returns:
        One dictionary per data point, in input order, holding
        ``loss_domain_1`` .. ``loss_domain_5``.

    Raises:
        ValueError: If ``group`` has no fitted constants (checked before any
                    data point is processed) or a data point lacks one of
                    the proportion keys.
    """
    # Fitted constants, indexed as params[group][domain].
    params = {
        '160M': {
            1: {'a': 0.13370571385297636, 'b': 0.002390134393468617, 'c': 2.2532615941125935},
            2: {'a': 0.03558508709506605, 'b': 0.002818038362083297, 'c': 3.2631919364668476},
            3: {'a': 0.09252859186605873, 'b': 0.0007337050789613676, 'c': 2.6176902832037365},
            4: {'a': 0.12312317552374129, 'b': 0.0019513289742103528, 'c': 1.1948582742030367},
            5: {'a': 0.14154544729126772, 'b': 0.02730698179138869, 'c': 3.0846493439463316},
        },
        '305M': {
            1: {'a': 0.12677944537739508, 'b': 0.0021060417483921446, 'c': 2.116688631238799},
            2: {'a': 0.03857137198216823, 'b': 0.004693509918912462, 'c': 3.0993804511546172},
            3: {'a': 0.08371624093149321, 'b': 0.0003880177278893411, 'c': 2.498051678487977},
            4: {'a': 0.1166480072254707, 'b': 0.0018641785034444133, 'c': 1.0998534270479912},
            5: {'a': 0.1371317714952529, 'b': 0.02325374313900293, 'c': 2.9187956816183247},
        },
        '410M': {
            1: {'a': 0.12150735155719182, 'b': 0.001797374580772529, 'c': 2.0637659341196875},
            2: {'a': 0.02704989730064647, 'b': 0.0012212141601325463, 'c': 3.0488279783306345},
            3: {'a': 0.08424583253617433, 'b': 0.00036447295242515113, 'c': 2.4312477954997536},
            4: {'a': 0.11564152660339765, 'b': 0.0020181562198990236, 'c': 1.0617583947352045},
            5: {'a': 0.13918678570181242, 'b': 0.021835831641187086, 'c': 2.841747294132424},
        },
        '70M': {
            1: {'a': 0.15339974058406491, 'b': 0.0032688436937777696, 'c': 2.536901156656863},
            2: {'a': 0.05120827004111036, 'b': 0.007660058657766675, 'c': 3.568955621338834},
            3: {'a': 0.10197068274747872, 'b': 0.0009064652901137906, 'c': 2.8861808195815533},
            4: {'a': 0.13849029277376568, 'b': 0.001974889729256156, 'c': 1.4039456354099615},
            5: {'a': 0.1375046566797978, 'b': 0.02690444916275532, 'c': 3.435925431698058},
        },
    }

    # Reject an unknown group up front, even when there is nothing to predict.
    coeffs_by_domain = params.get(group)
    if coeffs_by_domain is None:
        raise ValueError(f'Unknown group: {group}. Available groups: {list(params)}')

    def losses_for(sample: dict[str, float]) -> dict[str, float]:
        """Evaluate the law for all five domains of a single data point."""
        losses = {}
        # The per-group table is keyed 1..5 in ascending order, so iterating
        # it visits the domains in the same order as the output keys.
        for domain, coeffs in coeffs_by_domain.items():
            prop_key = f'proportion_domain_{domain}'
            if prop_key not in sample:
                raise ValueError(f'Missing input variable: {prop_key}')
            losses[f'loss_domain_{domain}'] = (
                coeffs['c'] - coeffs['a'] * math.log(sample[prop_key] + coeffs['b'])
            )
        return losses

    return [losses_for(sample) for sample in input_data]

#4 Run 4 R² = 0.970491

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict per-domain losses from mixture proportions with a piecewise power law.

    Every domain i in 1..5 is modelled independently as

        loss_i = L_0_i                                      if proportion_i == 0
        loss_i = L_inf_i + C_i * proportion_i ** (-alpha_i)  otherwise

    where (L_0, L_inf, C, alpha) are constants fitted separately per group
    and domain.

    Args:
        input_data: Data points mapping ``proportion_domain_i`` to that
                    domain's proportion. Domains whose key is absent are
                    skipped rather than predicted.
        group: Name of the group whose constants are used ('70M', '160M',
               '305M' or '410M').

    Returns:
        One dictionary per data point, in input order, holding a
        ``loss_domain_i`` entry for every domain whose proportion was given.

    Raises:
        KeyError: If ``group`` has no fitted constants and at least one
                  proportion key is present in the input.
    """
    # Fitted constants, indexed as params[group][domain].
    params = {
        "70M": {
            1: {"L_0": 3.414908, "L_inf": 0.100000, "C": 2.447942, "alpha": 0.055697},
            2: {"L_0": 3.818429, "L_inf": 3.293314, "C": 0.289679, "alpha": 0.128688},
            3: {"L_0": 3.600640, "L_inf": 0.100000, "C": 2.809106, "alpha": 0.031725},
            4: {"L_0": 2.266335, "L_inf": 1.184513, "C": 0.285163, "alpha": 0.269767},
            5: {"L_0": 3.931742, "L_inf": 2.131576, "C": 1.306624, "alpha": 0.087635},
        },
        "160M": {
            1: {"L_0": 3.060407, "L_inf": 0.100000, "C": 2.162561, "alpha": 0.055281},
            2: {"L_0": 3.472137, "L_inf": 2.968259, "C": 0.301832, "alpha": 0.095188},
            3: {"L_0": 3.285555, "L_inf": 0.100000, "C": 2.538393, "alpha": 0.031901},
            4: {"L_0": 1.963058, "L_inf": 0.100000, "C": 1.118286, "alpha": 0.089544},
            5: {"L_0": 3.594913, "L_inf": 0.100000, "C": 2.985780, "alpha": 0.041148},
        },
        "305M": {
            1: {"L_0": 2.898031, "L_inf": 0.100000, "C": 2.025209, "alpha": 0.056080},
            2: {"L_0": 3.306184, "L_inf": 2.808446, "C": 0.299215, "alpha": 0.101365},
            3: {"L_0": 3.155623, "L_inf": 0.100000, "C": 2.414588, "alpha": 0.030772},
            4: {"L_0": 1.832974, "L_inf": 0.100000, "C": 1.022308, "alpha": 0.092419},
            5: {"L_0": 3.434413, "L_inf": 0.100000, "C": 2.819827, "alpha": 0.042872},
        },
        "410M": {
            1: {"L_0": 2.831881, "L_inf": 0.100000, "C": 1.971811, "alpha": 0.055358},
            2: {"L_0": 3.230276, "L_inf": 2.748866, "C": 0.304027, "alpha": 0.075641},
            3: {"L_0": 3.098252, "L_inf": 0.100000, "C": 2.348189, "alpha": 0.031759},
            4: {"L_0": 1.779367, "L_inf": 0.524991, "C": 0.569721, "alpha": 0.146568},
            5: {"L_0": 3.374611, "L_inf": 0.100000, "C": 2.743305, "alpha": 0.044620},
        },
    }

    results = []
    for sample in input_data:
        losses = {}
        for domain in range(1, 6):
            prop_key = f"proportion_domain_{domain}"
            if prop_key not in sample:
                # No proportion supplied for this domain: emit no prediction.
                continue

            share = sample[prop_key]
            coeffs = params[group][domain]
            # Zero is special-cased: 0 ** (-alpha) is a division by zero, so
            # the separately fitted zero-proportion loss L_0 is used instead.
            losses[f"loss_domain_{domain}"] = (
                coeffs["L_0"]
                if share == 0
                else coeffs["L_inf"] + coeffs["C"] * (share ** (-coeffs["alpha"]))
            )
        results.append(losses)

    return results

#5 Run 5 R² = 0.968336

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict per-domain losses from mixture proportions with a reciprocal law.

    Every domain i in 1..5 is modelled independently as

        loss_domain_i = a + b / (proportion_domain_i + eps)

    where (a, b, eps) are constants fitted separately per group and domain.

    Args:
        input_data: Data points; each must provide ``proportion_domain_1`` ..
                    ``proportion_domain_5``.
        group: Name of the group whose constants are used ('70M', '160M',
               '305M' or '410M').

    Returns:
        One dictionary per data point, in input order, holding
        ``loss_domain_1`` .. ``loss_domain_5``.

    Raises:
        ValueError: If ``group`` has no fitted constants (checked before any
                    data point is processed) or a data point lacks one of
                    the proportion keys.
    """
    # Fitted constants, indexed as params[group][domain].
    params = {
        "70M": {
            1: {"a": 2.5662, "b": 0.0488, "eps": 0.0576},
            2: {"a": 3.5963, "b": 0.0141, "eps": 0.0636},
            3: {"a": 3.0578, "b": 0.0084, "eps": 0.0155},
            4: {"a": 1.5025, "b": 0.0288, "eps": 0.0377},
            5: {"a": 3.3529, "b": 0.1127, "eps": 0.1952},
        },
        "160M": {
            1: {"a": 2.2834, "b": 0.0402, "eps": 0.0519},
            2: {"a": 3.2866, "b": 0.0083, "eps": 0.0445},
            3: {"a": 2.7769, "b": 0.0073, "eps": 0.0143},
            4: {"a": 1.2831, "b": 0.0255, "eps": 0.0375},
            5: {"a": 2.9952, "b": 0.1205, "eps": 0.2034},
        },
        "305M": {
            1: {"a": 2.1469, "b": 0.0374, "eps": 0.0498},
            2: {"a": 3.1226, "b": 0.0097, "eps": 0.0528},
            3: {"a": 2.6482, "b": 0.0059, "eps": 0.0117},
            4: {"a": 1.1838, "b": 0.0240, "eps": 0.0370},
            5: {"a": 2.8383, "b": 0.1097, "eps": 0.1856},
        },
        "410M": {
            1: {"a": 2.0943, "b": 0.0350, "eps": 0.0476},
            2: {"a": 3.0684, "b": 0.0057, "eps": 0.0351},
            3: {"a": 2.5829, "b": 0.0059, "eps": 0.0115},
            4: {"a": 1.1439, "b": 0.0241, "eps": 0.0379},
            5: {"a": 2.7604, "b": 0.1109, "eps": 0.1828},
        },
    }

    # Reject an unknown group up front, even when there is nothing to predict.
    coeffs_by_domain = params.get(group)
    if coeffs_by_domain is None:
        raise ValueError(f"Unknown group: {group}. Valid groups are: {list(params)}")

    results = []
    for sample in input_data:
        losses = {}
        # The per-group table is keyed 1..5 in ascending order, so iterating
        # it visits the domains in the same order as the output keys.
        for domain_idx, coeffs in coeffs_by_domain.items():
            prop_key = f"proportion_domain_{domain_idx}"
            if prop_key not in sample:
                raise ValueError(f"Missing input key: {prop_key}")

            denominator = sample[prop_key] + coeffs["eps"]
            losses[f"loss_domain_{domain_idx}"] = coeffs["a"] + coeffs["b"] / denominator
        results.append(losses)

    return results