SLD - MoE Scaling Law - claude-code + claude-haiku-4-5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.690013

▼

Python

import math


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted MoE scaling law for a batch of data points.

    The law is linear in natural-log space, with a cross term between the
    two log-transformed inputs:

        log(loss_validation) = a*log(num_experts)
                               + b*log(dense_parameter_count)
                               + c*(log(num_experts)*log(dense_parameter_count))
                               + const

    Fitted coefficients (rounded here; full precision is in the code):
        a = -0.100962, b = -0.084521, c = 0.004064, const = 2.563144

    Args:
        input_data: One dictionary per data point. Each dictionary must
                    provide the keys 'num_experts' and 'dense_parameter_count'.
        group: Name of the experimental group. Only 'all_data' has fitted
               coefficients; any other name falls back to those same values.

    Returns:
        A list parallel to input_data; each element is a dictionary holding
        the prediction under the key 'loss_validation'.
    """
    # Coefficients per group, stored as (a, b, c, const).
    fitted = {
        'all_data': (
            -0.10096209505615544,
            -0.08452076489156325,
            0.004064472374478456,
            2.56314364064211,
        ),
    }

    # Groups without their own fit reuse the 'all_data' coefficients.
    w_experts, w_dense, w_cross, offset = fitted.get(group, fitted['all_data'])

    def predict(row):
        # Read both inputs before doing any math, then work in log space.
        experts, dense = row['num_experts'], row['dense_parameter_count']
        ln_e, ln_d = math.log(experts), math.log(dense)
        ln_loss = w_experts * ln_e + w_dense * ln_d + w_cross * (ln_e * ln_d) + offset
        # Undo the log transform to return the loss on its natural scale.
        return math.exp(ln_loss)

    return [{'loss_validation': predict(row)} for row in input_data]

#2 Run 2 R² = 0.467622

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts validation loss from a fitted multiplicative power law.

    The law is:
        loss = 10^intercept * num_experts^coef_experts
               * dense_parameter_count^coef_dense

    It is evaluated in log10 space:
        log10(loss) = intercept + coef_experts*log10(num_experts)
                      + coef_dense*log10(dense_parameter_count)

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point. Each must contain the keys 'num_experts' and
                    'dense_parameter_count'.
        group: The name of the experimental group for which to make predictions.
                Only 'all_data' has fitted coefficients; any other group name
                falls back to the 'all_data' coefficients.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted value under 'loss_validation'.
    """
    # Imported once at function scope instead of on every loop iteration.
    import math

    # Parameters fitted to the data
    group_params = {
        'all_data': {
            'intercept': 1.002994070146366,
            'coef_experts': -0.02740192252755339,
            'coef_dense': -0.0705124860796179,
        }
    }

    # Unknown groups reuse the 'all_data' fit.
    params = group_params.get(group, group_params['all_data'])

    # Resolve the coefficients once; they do not change between data points.
    intercept = params['intercept']
    coef_experts = params['coef_experts']
    coef_dense = params['coef_dense']

    results = []
    for data_point in input_data:
        num_experts = data_point['num_experts']
        dense_parameter_count = data_point['dense_parameter_count']

        # Apply the scaling law formula in log10 space
        log_loss = (
            intercept
            + coef_experts * math.log10(num_experts)
            + coef_dense * math.log10(dense_parameter_count)
        )

        # Convert back to linear scale
        results.append({'loss_validation': 10 ** log_loss})

    return results

#3 Run 3 R² = 0.467622

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts validation loss from a fitted multiplicative power law.

    The scaling law is:
    loss_validation = A * num_experts^α * dense_parameter_count^β

    Where:
    - A = 10.0691792031
    - α = -0.0274019225 (exponent for num_experts)
    - β = -0.0705124861 (exponent for dense_parameter_count)

    This is the linear-space form of a log-linear model:
    log(loss) = log(A) + α*log(num_experts) + β*log(dense_parameter_count)

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point. Each must contain the keys 'num_experts' and
                    'dense_parameter_count'.
        group: The name of the experimental group for which to make predictions.
                Only 'all_data' has fitted coefficients; any other group name
                falls back to the 'all_data' coefficients.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted value under 'loss_validation'.
    """
    # Parameters fitted from the data
    params = {
        'all_data': {
            'A': 10.0691792031,
            'alpha': -0.0274019225,
            'beta': -0.0705124861,
        }
    }

    # Unknown groups reuse the 'all_data' fit.
    param_set = params.get(group, params['all_data'])
    A = param_set['A']
    alpha = param_set['alpha']
    beta = param_set['beta']

    def predict(data_point):
        num_experts = data_point['num_experts']
        dense_param_count = data_point['dense_parameter_count']
        # loss = A * num_experts^alpha * dense_parameter_count^beta
        return A * (num_experts ** alpha) * (dense_param_count ** beta)

    return [{'loss_validation': predict(data_point)} for data_point in input_data]

#4 Run 4 R² = 0.467621

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate a multiplicative power law for a batch of data points.

    Formula:
        loss_validation = C * num_experts^alpha * dense_parameter_count^beta

    Args:
        input_data: One dictionary per data point. Each dictionary must
                    provide the keys 'num_experts' and 'dense_parameter_count'.
        group: Name of the experimental group. The formula is shared by all
                groups and only the coefficients may vary. Only 'all_data'
                has coefficients here; other names fall back to them.

    Returns:
        A list parallel to input_data; each element is a dictionary holding
        the prediction under the key 'loss_validation'.
    """
    # Coefficients per group, stored as (C, alpha, beta).
    # C is annotated in the fit as 10^1.00299407.
    fitted = {
        'all_data': (10.06917920, -0.02740192, -0.07051249),
    }

    # Groups without their own entry reuse the 'all_data' coefficients.
    key = group if group in fitted else 'all_data'
    scale, exp_experts, exp_dense = fitted[key]

    def predict(row):
        experts = row['num_experts']
        dense = row['dense_parameter_count']
        return scale * (experts ** exp_experts) * (dense ** exp_dense)

    return [{'loss_validation': predict(row)} for row in input_data]

#5 Run 5 R² = -0.121429

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate a log10-linear scaling law with an interaction term.

    Formula (the loss itself is linear in the log10-transformed inputs):
        loss_validation = a + b*log10(num_experts)
                          + c*log10(dense_parameter_count)
                          + d*log10(num_experts)*log10(dense_parameter_count)

    Args:
        input_data: One dictionary per data point. Each dictionary must
                    provide the keys 'num_experts' and 'dense_parameter_count'.
        group: Name of the experimental group. The formula is shared by all
                groups and only the coefficients may vary. Only 'all_data'
                has coefficients here; other names fall back to them.

    Returns:
        A list parallel to input_data; each element is a dictionary holding
        the prediction under the key 'loss_validation'.
    """
    import math

    # Coefficients per group, stored as (a, b, c, d).
    table = {
        'all_data': (7.02938669, -0.80004463, -0.53603608, 0.08089269),
    }

    # Groups without their own entry reuse the 'all_data' coefficients.
    bias, w_experts, w_dense, w_cross = table.get(group, table['all_data'])

    def evaluate(row):
        experts = row['num_experts']
        dense = row['dense_parameter_count']
        lg_e = math.log10(experts)
        lg_d = math.log10(dense)
        return bias + w_experts * lg_e + w_dense * lg_d + w_cross * lg_e * lg_d

    return [{'loss_validation': evaluate(row)} for row in input_data]