
Data-Constrained Scaling Law

Agent: claude-code
Model: claude-sonnet-4-5
Best R²: 0.920974
Mean R²: 0.915505
Min R²: 0.914127
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 1 R² = 0.920974
Python
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """

    # Fitted parameters for each group
    # The scaling law form: L = A/N^α + B/D_eff^β + E
    # where D_eff = U^γ * D^(1-γ) is the effective data considering repetition
    GROUP_PARAMS = {
        'all_data': {
            'A': 8.3711431840e+02,
            'alpha': 0.3742628023,
            'B': 1.9741512532e+03,
            'beta': 0.3464706122,
            'gamma': 0.1898222449,
            'E': 2.0896145867
        },
    }

    # Get parameters for the specified group
    if group not in GROUP_PARAMS:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(GROUP_PARAMS.keys())}")

    params = GROUP_PARAMS[group]
    A = params['A']
    alpha = params['alpha']
    B = params['B']
    beta = params['beta']
    gamma = params['gamma']
    E = params['E']

    # Make predictions for each data point
    results = []
    for data_point in input_data:
        # Extract input variables
        N = data_point['params']  # Model parameters
        D = data_point['tokens']  # Total training tokens
        U = data_point['unique_tokens']  # Unique tokens in dataset

        # Calculate effective data
        # D_eff blends unique tokens and total tokens
        # When γ ≈ 0: D_eff ≈ D (repetition has full benefit)
        # When γ ≈ 1: D_eff ≈ U (repetition has no benefit)
        # Fitted γ ≈ 0.19 indicates repetition has substantial but diminishing benefit
        D_eff = (U ** gamma) * (D ** (1 - gamma))

        # Apply the scaling law: L = A/N^α + B/D_eff^β + E
        # A/N^α:     model-size term (larger models → lower loss)
        # B/D_eff^β: data term (more effective data → lower loss)
        # E:         irreducible loss (theoretical minimum)
        loss = A / (N ** alpha) + B / (D_eff ** beta) + E

        # Return prediction
        results.append({'loss': loss})

    return results
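
For illustration, here is a minimal usage sketch of the function above. The numeric inputs are hypothetical (not points from the benchmark data) and are chosen only to show the expected call signature and the effect of the D_eff blend under token repetition.

Python
# Minimal usage sketch. The two sample points below are hypothetical values,
# not benchmark data: same model size and token budget, differing only in how
# many of the training tokens are unique.
sample = [
    {'params': 1.0e9, 'tokens': 1.0e11, 'unique_tokens': 2.5e10},  # tokens repeated ~4x
    {'params': 1.0e9, 'tokens': 1.0e11, 'unique_tokens': 1.0e11},  # fully unique tokens
]
predictions = law(sample, group='all_data')
for point, pred in zip(sample, predictions):
    print(f"N={point['params']:.1e}  D={point['tokens']:.1e}  "
          f"U={point['unique_tokens']:.1e}  ->  loss={pred['loss']:.4f}")
# With gamma ≈ 0.19, D_eff for the repeated-data point shrinks only by a factor
# of 4**0.19 ≈ 1.3 relative to the fully unique point, so its predicted loss is
# only slightly higher.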
#2 Run 2 R² = 0.914154
#3 Run 3 R² = 0.914136
#4 Run 4 R² = 0.914136
#5 Run 5 R² = 0.914127