SLD - Data-Constrained Scaling Law - claude-code + claude-haiku-4-5

All Runs (sorted by R²)

Best Run 1 R² = 0.914139

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict the loss of each data point with an additive power-law scaling law.

    The underlying scaling law is:
    loss = a + b/params^α + c/tokens^β + d/unique_tokens^γ

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Expected keys: 'params', 'tokens',
                    'unique_tokens'. A missing key is treated as 1.0, and every
                    value is raised to at least 1e-10 before exponentiation.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups, but
                the coefficients can differ per group. Only 'all_data' has fitted
                coefficients here; any other value (including None or '') falls
                back to the 'all_data' coefficients.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable 'loss'.
    """
    # Fitted coefficients of the scaling law, keyed by experimental group.
    params_by_group = {
        'all_data': {
            'a': 1.854174103052296,
            'b': 5185.897600342049,
            'alpha': 0.5065474635986845,
            'c': 108445.00928304848,
            'beta': 0.5635676108042057,
            'd': 14.148203751260953,
            'gamma': 0.12921116039317365
        }
    }

    # A known group uses its own coefficients; anything else (None, '', or an
    # unknown name) uses the universal 'all_data' fit.
    params = params_by_group.get(group, params_by_group['all_data'])

    a = params['a']
    b = params['b']
    alpha = params['alpha']
    c = params['c']
    beta = params['beta']
    d = params['d']
    gamma = params['gamma']

    # Lower bound applied to every input so that the power terms never divide
    # by zero and never see a non-positive base.
    min_value = 1e-10

    results = []
    for data_point in input_data:
        params_val = max(data_point.get('params', 1.0), min_value)
        tokens_val = max(data_point.get('tokens', 1.0), min_value)
        unique_tokens_val = max(data_point.get('unique_tokens', 1.0), min_value)

        # Apply the scaling law formula
        loss = a + b / (params_val ** alpha) + c / (tokens_val ** beta) + d / (unique_tokens_val ** gamma)

        results.append({'loss': loss})

    return results

#2 Run 2 R² = 0.914136

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted additive scaling law on every data point.

    The law is: loss = a + b/params^alpha + c/tokens^beta + d/unique_tokens^gamma

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to values. The keys read are 'params',
                    'tokens' and 'unique_tokens'; a missing key is treated as 1.
        group: The name of the experimental group for which to make predictions.
                A group without fitted coefficients uses the 'all_data' ones.

    Returns:
        A list with one dictionary per input data point, each holding the
        prediction under the key 'loss'.
    """
    # Coefficients per group; only the 'all_data' fit is available.
    fitted = {
        'all_data': {
            'a': 1.85425599,
            'b': 5186.01367392,
            'alpha': 0.50654896,
            'c': 108441.31009788,
            'beta': 0.56356580,
            'd': 14.15027407,
            'gamma': 0.12922242
        }
    }

    # Unknown groups are redirected to the 'all_data' coefficients.
    chosen = fitted[group if group in fitted else 'all_data']
    a, b, c, d = chosen['a'], chosen['b'], chosen['c'], chosen['d']
    alpha, beta, gamma = chosen['alpha'], chosen['beta'], chosen['gamma']

    def predict(point):
        # Read all three inputs first, then evaluate the law left to right.
        n_params = point.get('params', 1)
        n_tokens = point.get('tokens', 1)
        n_unique = point.get('unique_tokens', 1)
        return a + b / (n_params ** alpha) + c / (n_tokens ** beta) + d / (n_unique ** gamma)

    return [{'loss': predict(point)} for point in input_data]

#3 Run 3 R² = 0.804670

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted multiplicative power law on every data point.

    The scaling law is: loss = a * params^alpha * tokens^beta * unique_tokens^gamma

    Args:
        input_data: A list of dictionaries, one per data point. Each must contain
                    the keys 'params', 'tokens' and 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
                Only 'all_data' has fitted coefficients.

    Returns:
        A list with one dictionary per input data point, each holding the
        prediction under the key 'loss'.

    Raises:
        ValueError: If group has no fitted coefficients.
        KeyError: If a data point lacks one of the required keys.
    """
    # Coefficients per group; only the 'all_data' fit is available.
    fitted = {
        'all_data': {
            'a': 89.03636,
            'alpha': -0.067132,
            'beta': -0.057418,
            'gamma': -0.028216
        }
    }

    # Reject groups that were never fitted, even when there is nothing to predict.
    chosen = fitted.get(group)
    if chosen is None:
        raise ValueError(f"Unknown group: {group}")

    a, alpha, beta, gamma = chosen['a'], chosen['alpha'], chosen['beta'], chosen['gamma']

    def predict(point):
        # Look up every required key before doing any arithmetic.
        n_params, n_tokens, n_unique = point['params'], point['tokens'], point['unique_tokens']
        return a * (n_params ** alpha) * (n_tokens ** beta) * (n_unique ** gamma)

    return [{'loss': predict(point)} for point in input_data]

#4 Run 4 R² = 0.804644

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted multiplicative power law on every data point.

    The law has the form:

    loss = a * params^b * tokens^c * unique_tokens^d

    Where:
    - params, tokens, unique_tokens: values read from each data point
    - a, b, c, d: fitted coefficients of the selected group

    Args:
        input_data: A list of dictionaries, one per data point. Each must contain
                    the keys 'params', 'tokens' and 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
                A group without fitted coefficients uses the 'all_data' ones.

    Returns:
        A list with one dictionary per input data point, each holding the
        prediction under the key 'loss'.
    """
    # Coefficients per group; only the 'all_data' fit is available.
    fitted = {
        'all_data': {
            'a': 8.9036358201e+01,
            'b': -0.0671315603,
            'c': -0.0574183729,
            'd': -0.0282163211
        }
    }

    # Unknown groups are served with the 'all_data' coefficients.
    chosen = fitted[group] if group in fitted else fitted['all_data']
    a, b, c, d = chosen['a'], chosen['b'], chosen['c'], chosen['d']

    def predict(point):
        # Look up every required key before doing any arithmetic.
        n_params, n_tokens, n_unique = point['params'], point['tokens'], point['unique_tokens']
        return a * (n_params ** b) * (n_tokens ** c) * (n_unique ** d)

    return [{'loss': predict(point)} for point in input_data]

#5 Run 5 R² = 0.453505

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict the loss of each data point with a multiplicative power law plus offset.

    The scaling law is:
    loss = a * params^(-α) * tokens^(-β) * unique_tokens^(-γ) + e

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. The keys read are 'params', 'tokens'
                    and 'unique_tokens'; a missing key is treated as 1.0.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups, but
                the coefficients can differ per group. A group without fitted
                coefficients uses the 'all_data' ones.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable 'loss'.
    """

    # Fitted coefficients of the scaling law, keyed by experimental group.
    parameters = {
        'all_data': {
            'a': 34956.653629,
            'alpha': 0.166158,
            'beta': 0.200495,
            'gamma': 0.133205,
            'e': 2.555185
        }
    }

    # A known group uses its own coefficients; anything else uses 'all_data'.
    params = parameters.get(group, parameters['all_data'])

    # The coefficients do not depend on the data point, so read them once.
    a = params['a']
    alpha = params['alpha']
    beta = params['beta']
    gamma = params['gamma']
    e = params['e']

    predictions = []

    for data_point in input_data:
        # Extract input variables
        p = data_point.get('params', 1.0)
        t = data_point.get('tokens', 1.0)
        u = data_point.get('unique_tokens', 1.0)

        # loss = a * params^(-α) * tokens^(-β) * unique_tokens^(-γ) + e
        loss = a * (p ** (-alpha)) * (t ** (-beta)) * (u ** (-gamma)) + e

        predictions.append({'loss': loss})

    return predictions