SLD - MoE Scaling Law - gemini-cli + Gemini 2.5 Flash

Best Run 1 R² = 0.832695

▼

Python

import numpy as np

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Fitted parameters for the 'all_data' group
    # A = 43.47573060740084
    # alpha = 0.07398280187051419
    # beta = 0.19898571805319756
    # L0 = 1.6170183728670307

    # Since there's only one group ('all_data'), the parameters are fixed.
    # In a multi-group scenario, you would have a dictionary of parameters per group.
    params = {
        'all_data': {
            'A': 43.47573060740084,
            'alpha': 0.07398280187051419,
            'beta': 0.19898571805319756,
            'L0': 1.6170183728670307
        }
    }

    if group not in params:
        raise ValueError(f"Unknown group: {group}")

    current_params = params[group]
    A = current_params['A']
    alpha = current_params['alpha']
    beta = current_params['beta']
    L0 = current_params['L0']

    predictions = []
    for data_point in input_data:
        num_experts = data_point['num_experts']
        dense_parameter_count = data_point['dense_parameter_count']

        # Apply the scaling law
        loss_validation = A * (num_experts ** (-alpha)) * (dense_parameter_count ** (-beta)) + L0
        predictions.append({'loss_validation': loss_validation})

    return predictions

#2 Run 2 R² = 0.467622

▼

Python

import numpy as np

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    predictions = []
    # In this specific dataset, there's only one group: 'all_data'.
    # If there were multiple groups, we would load different parameters based on the 'group' argument.
    if group == 'all_data':
        # Fitted parameters from the analysis
        A = -0.0274019225275534
        B = -0.0705124860796179
        C = 10.069179203081296
    else:
        # Handle cases where the group is not recognized or parameters are not available
        # For this problem, 'all_data' is the only expected group.
        # A more robust solution might raise an error or return NaNs/default values.
        raise ValueError(f"Group '{group}' not recognized or parameters not available.")

    for data_point in input_data:
        num_experts = data_point['num_experts']
        dense_parameter_count = data_point['dense_parameter_count']

        # Ensure num_experts and dense_parameter_count are positive for the power law
        if num_experts <= 0 or dense_parameter_count <= 0:
            # Handle invalid input, e.g., by returning NaN or a default value
            predicted_loss = np.nan
        else:
            predicted_loss = C * (num_experts**A) * (dense_parameter_count**B)
        
        predictions.append({'loss_validation': predicted_loss})
    
    return predictions

#3 Run 3 R² = 0.467622

▼

#4 Run 4 R² = 0.347867

▼

#5 Run 5 R² = -0.753057

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    import numpy as np

    # Fitted parameters for the 'all_data' group
    # These values were obtained by fitting the scaling law to the provided dataset.
    # Mathematical form: loss_validation = a + b * log(num_experts) + c * log(dense_parameter_count)
    fitted_params = {
        'all_data': {
            'a': 6.077175446130047,
            'b': -0.07132315536715568,
            'c': -0.18021292228947294
        }
    }

    if group not in fitted_params:
        raise ValueError(f"No fitted parameters found for group: {group}")

    a = fitted_params[group]['a']
    b = fitted_params[group]['b']
    c = fitted_params[group]['c']

    predictions = []
    for data_point in input_data:
        num_experts = data_point['num_experts']
        dense_parameter_count = data_point['dense_parameter_count']

        # Ensure non-zero values for log transformation
        num_experts_log = np.log(num_experts) if num_experts > 0 else np.log(1e-9)
        dense_parameter_count_log = np.log(dense_parameter_count) if dense_parameter_count > 0 else np.log(1e-9)
        
        predicted_loss_validation = a + b * num_experts_log + c * dense_parameter_count_log
        predictions.append({'loss_validation': predicted_loss_validation})

    return predictions

MoE Scaling Law

All Runs (sorted by R²)