SLD - MoE Scaling Law - claude-code + claude-sonnet-4-5

Best Run 1 R² = 0.832730

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted MoE scaling law for every point in ``input_data``.

    The law has the form ``loss_validation = a / (N**alpha * E**beta) + c``,
    where ``N`` is ``dense_parameter_count`` and ``E`` is ``num_experts``.

    Args:
        input_data: Data points to score. Each dictionary is read for the keys
                    'dense_parameter_count' and 'num_experts'.
        group: Name of the experimental group whose fitted constants are used.
               The functional form is shared by all groups; only the constants
               may differ. Only 'all_data' has constants defined here.

    Returns:
        One ``{'loss_validation': value}`` dictionary per input point, in the
        same order as ``input_data``.

    Raises:
        ValueError: If ``group`` has no fitted constants.
        KeyError: If a data point lacks one of the two input keys.
    """
    # Fitted constants per group, packed as (a, alpha, beta, c).
    fitted = {
        'all_data': (4.347567e+01, 0.198986, 0.073983, 1.617018),
    }

    if group not in fitted:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(fitted.keys())}")

    scale, n_exponent, e_exponent, floor = fitted[group]

    # The dense-parameter key is read before the expert key for each point,
    # and the arithmetic keeps the grouping a / (N^alpha * E^beta) + c.
    return [
        {
            'loss_validation': scale
            / ((row['dense_parameter_count'] ** n_exponent) * (row['num_experts'] ** e_exponent))
            + floor
        }
        for row in input_data
    ]

#2 Run 2 R² = 0.832727

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss from an effective-parameter-count scaling law.

    The law is ``L = A / (N_dense * E**beta)**alpha + L_inf``, where
    ``N_dense`` is ``dense_parameter_count``, ``E`` is ``num_experts`` and
    ``A``, ``alpha``, ``beta``, ``L_inf`` are fitted constants.

    Args:
        input_data: Data points to score. Each dictionary is read for the keys
                    'dense_parameter_count' and 'num_experts'.
        group: Name of the experimental group whose fitted constants are used.
               The functional form is the same for every group; only the
               constants may differ. Only 'all_data' is defined here.

    Returns:
        A list aligned with ``input_data``; each entry is a dictionary holding
        the prediction under the key 'loss_validation'.

    Raises:
        ValueError: If ``group`` has no fitted constants.
        KeyError: If a data point lacks one of the two input keys.
    """
    fitted_by_group = {
        'all_data': {
            'A': 43.47568,
            'alpha': 0.198986,
            'beta': 0.371799,
            'L_inf': 1.617018,
        },
    }

    coeffs = fitted_by_group.get(group)
    if coeffs is None:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(fitted_by_group.keys())}")

    amplitude = coeffs['A']
    size_exponent = coeffs['alpha']
    expert_exponent = coeffs['beta']
    floor = coeffs['L_inf']

    def predict(point):
        # Experts scale the dense parameter count into an "effective" size,
        # which then enters a single power-law term.
        dense = point['dense_parameter_count']
        experts = point['num_experts']
        effective_size = dense * (experts ** expert_exponent)
        return {'loss_validation': amplitude / (effective_size ** size_exponent) + floor}

    return [predict(point) for point in input_data]

#3 Run 3 R² = 0.832697

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Apply the multiplicative power-law fit to each input data point.

    The law is ``L = a * D**(-alpha) * E**(-beta) + c``, where ``L`` is
    ``loss_validation``, ``D`` is ``dense_parameter_count`` and ``E`` is
    ``num_experts``.

    Args:
        input_data: Data points to score. Each dictionary is read for the keys
                    'num_experts' and 'dense_parameter_count'.
        group: Name of the experimental group whose fitted constants are used.
               All groups share the functional form; only the constants may
               differ. Only 'all_data' is defined here.

    Returns:
        A list with one ``{'loss_validation': value}`` dictionary per input
        point, in input order.

    Raises:
        ValueError: If ``group`` has no fitted constants.
        KeyError: If a data point lacks one of the two input keys.
    """
    constants = {
        'all_data': {
            'a': 4.3475459848e+01,
            'alpha': 0.1989852740,
            'beta': 0.0739825824,
            'c': 1.6170160931,
        },
    }

    if group not in constants:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(constants.keys())}")

    chosen = constants[group]
    a, alpha, beta, c = chosen['a'], chosen['alpha'], chosen['beta'], chosen['c']

    # Pull (experts, dense) lazily so each point is read and scored in turn;
    # the expert count is looked up first, then the dense parameter count.
    pairs = (
        (point['num_experts'], point['dense_parameter_count'])
        for point in input_data
    )

    return [
        {'loss_validation': a * (dense ** (-alpha)) * (experts ** (-beta)) + c}
        for experts, dense in pairs
    ]

#4 Run 4 R² = 0.832696

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Score data points with the MoE scaling law ``L = a * N**(-alpha) * E**(-beta) + c``.

    ``L`` is the predicted ``loss_validation``, ``N`` is
    ``dense_parameter_count`` and ``E`` is ``num_experts``; ``a``, ``alpha``,
    ``beta`` and ``c`` are fitted constants.

    Args:
        input_data: Data points to score. Each dictionary is read for the keys
                    'dense_parameter_count' and 'num_experts'.
        group: Name of the experimental group whose fitted constants are used.
               The functional form is shared by all groups; only the constants
               may differ. Only 'all_data' is defined here.

    Returns:
        A list aligned with ``input_data``; each entry is a dictionary with
        the single key 'loss_validation'.

    Raises:
        ValueError: If ``group`` has no fitted constants.
        KeyError: If a data point lacks one of the two input keys.
    """
    # Constants for group 'all_data'. Per the original fit notes they come
    # from nonlinear least squares on the training data, with reported
    # R² = 0.958, RMSE = 0.052, MAE = 0.038.
    fit_table = {
        'all_data': {
            'a': 4.3475562897e+01,
            'alpha': 0.1989854424,
            'beta': 0.0739826608,
            'c': 1.6170169395,
        },
    }

    if group not in fit_table:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(fit_table.keys())}")

    a, alpha, beta, c = (fit_table[group][key] for key in ('a', 'alpha', 'beta', 'c'))

    def predicted_loss(dense_params, experts):
        # Reducible term decays with both model size and expert count;
        # c is the additive offset.
        return a * (dense_params ** (-alpha)) * (experts ** (-beta)) + c

    return [
        {'loss_validation': predicted_loss(sample['dense_parameter_count'], sample['num_experts'])}
        for sample in input_data
    ]

#5 Run 5 R² = 0.832695

▼

MoE Scaling Law

All Runs (sorted by R²)