SLD - Vocabulary Scaling Law - claude-code + claude-sonnet-4-5

All Runs (sorted by R²)

Best Run 1 R² = 0.986447

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the fitted vocabulary scaling law on a batch of data points.

    The law is an offset plus a single multiplicative power-law term:

        unigram_normalized_loss = A + B * V^alpha / (N^beta * D^gamma)

    with V = vocab_size, N = non_vocab_parameters and D = num_characters.

    Args:
        input_data: One dictionary per data point. Each dictionary is read
                    through the keys 'vocab_size', 'non_vocab_parameters'
                    and 'num_characters'.
        group: Name of the experimental group whose constants should be used.
                Only 'all_data' has fitted constants; every other name falls
                back to that same set.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the prediction under 'unigram_normalized_loss'.
    """
    fitted_constants = {
        'all_data': {
            'A': -5.6710314467673895,
            'B': 3997.4900001850224,
            'alpha': 0.060389341616412094,
            'beta': 0.035426879627548834,
            'gamma': 0.34778022803102326,
        },
    }

    # Unrecognised group names reuse the 'all_data' fit rather than raising.
    chosen = group if group in fitted_constants else 'all_data'
    constants = fitted_constants[chosen]

    offset, scale = constants['A'], constants['B']
    vocab_exp = constants['alpha']
    param_exp = constants['beta']
    data_exp = constants['gamma']

    def predict_one(point):
        vocab = point['vocab_size']
        model_params = point['non_vocab_parameters']
        chars = point['num_characters']

        # Numerator and denominator are formed separately, then divided,
        # so the floating-point operation order stays fixed.
        numerator = scale * vocab ** vocab_exp
        denominator = model_params ** param_exp * chars ** data_exp
        return {'unigram_normalized_loss': offset + numerator / denominator}

    return [predict_one(point) for point in input_data]

#2 Run 2 R² = 0.980335

▼

Python

import numpy as np


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate a full second-degree polynomial in log-space for each data point.

    With N = non_vocab_parameters, V = vocab_size and D = num_characters:

        loss = intercept
               + c1*log(N) + c2*log(V) + c3*log(D)
               + c4*log(N)^2 + c5*log(N)*log(V) + c6*log(N)*log(D)
               + c7*log(V)^2 + c8*log(V)*log(D) + c9*log(D)^2

    Logarithms are natural logs computed with np.log.

    Args:
        input_data: One dictionary per data point. Each dictionary is read
                    through the keys 'non_vocab_parameters', 'vocab_size'
                    and 'num_characters'.
        group: Name of the experimental group whose coefficients should be
                used. Only 'all_data' has fitted coefficients; every other
                name falls back to that same set.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the prediction under 'unigram_normalized_loss'.
    """
    coefficient_table = {
        'all_data': {
            'intercept': 43.653023,
            'c1': 0.584601,      # log(N)
            'c2': 0.779496,      # log(V)
            'c3': -4.504395,     # log(D)
            'c4': 0.025814,      # log(N)^2
            'c5': 0.022593,      # log(N)*log(V)
            'c6': -0.081356,     # log(N)*log(D)
            'c7': 0.028554,      # log(V)^2
            'c8': -0.073865,     # log(V)*log(D)
            'c9': 0.137360,      # log(D)^2
        },
    }

    # Unrecognised group names reuse the 'all_data' fit rather than raising.
    chosen = group if group in coefficient_table else 'all_data'
    selected = coefficient_table[chosen]

    bias = selected['intercept']
    w_n, w_v, w_d = selected['c1'], selected['c2'], selected['c3']
    w_nn, w_nv, w_nd = selected['c4'], selected['c5'], selected['c6']
    w_vv, w_vd, w_dd = selected['c7'], selected['c8'], selected['c9']

    def evaluate(point):
        n_params = point['non_vocab_parameters']
        vocab = point['vocab_size']
        chars = point['num_characters']

        ln_n = np.log(n_params)
        ln_v = np.log(vocab)
        ln_d = np.log(chars)

        # Linear terms first, then the second-order terms.
        terms = (
            w_n * ln_n,
            w_v * ln_v,
            w_d * ln_d,
            w_nn * ln_n**2,
            w_nv * ln_n * ln_v,
            w_nd * ln_n * ln_d,
            w_vv * ln_v**2,
            w_vd * ln_v * ln_d,
            w_dd * ln_d**2,
        )

        # Accumulate strictly left to right; a plain running total keeps the
        # floating-point rounding independent of how sum() is implemented.
        total = bias
        for term in terms:
            total = total + term
        return {'unigram_normalized_loss': total}

    return [evaluate(point) for point in input_data]

#3 Run 3 R² = 0.973920

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate an additive scaling law with a linear vocabulary term.

    With N = non_vocab_parameters, D = num_characters and V = vocab_size:

        unigram_normalized_loss = a + b / N^alpha + c / D^beta + d * V

    Args:
        input_data: One dictionary per data point. Each dictionary is read
                    through the keys 'non_vocab_parameters', 'num_characters'
                    and 'vocab_size'.
        group: Name of the experimental group whose constants should be used.
                Only 'all_data' has fitted constants; every other name falls
                back to that same set.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the prediction under 'unigram_normalized_loss'.
    """
    fitted = {
        'all_data': {
            'a': -5.7257846952760705,
            'b': 14225.684466145338,
            'alpha': 0.6378870596985718,
            'c': 5553.040069198156,
            'beta': 0.36932125490284595,
            'd': 2.6514193787820294e-06,
        },
    }

    # Unrecognised group names reuse the 'all_data' fit rather than raising.
    constants = fitted[group] if group in fitted else fitted['all_data']

    def predict_one(point):
        n_params = point['non_vocab_parameters']
        chars = point['num_characters']
        vocab = point['vocab_size']

        param_term = constants['b'] / (n_params ** constants['alpha'])
        data_term = constants['c'] / (chars ** constants['beta'])
        vocab_term = constants['d'] * vocab

        # Added left to right: offset, parameter term, data term, vocab term.
        return {
            'unigram_normalized_loss':
                constants['a'] + param_term + data_term + vocab_term
        }

    return [predict_one(point) for point in input_data]

#4 Run 4 R² = 0.969565

▼

Python

import math


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate a log-space polynomial scaling law for each data point.

    With C = num_characters, N = non_vocab_parameters and V = vocab_size
    (natural logarithms throughout):

        loss = intercept
               + b1*log(C) + b2*log(N) + b3*log(V)
               + c1*log(C)^2 + c2*log(N)^2 + c3*log(V)^2
               + d*log(C)*log(N)

    Args:
        input_data: One dictionary per data point. Each dictionary is read
                    through the keys 'num_characters', 'non_vocab_parameters'
                    and 'vocab_size'.
        group: Name of the experimental group whose coefficients should be
                used. Only 'all_data' is available.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the prediction under 'unigram_normalized_loss'.

    Raises:
        ValueError: If group has no fitted coefficients.
    """
    # Insertion order after 'intercept' is the order in which the terms are
    # accumulated below.
    params = {
        'all_data': {
            'intercept': 55.034742478160304,
            'log_chars': -4.985542273029513,
            'log_non_vocab': 0.5431887031961984,
            'log_vocab': -0.3290853205882094,
            'log_chars_sq': 0.1205166281173205,
            'log_non_vocab_sq': 0.0162115590376144,
            'log_vocab_sq': 0.01962529579067221,
            'log_c_x_log_nv': -0.05358364971911641,
        },
    }

    if group not in params:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(params.keys())}")

    weights = params[group]

    def predict_one(point):
        chars = point['num_characters']
        non_vocab = point['non_vocab_parameters']
        vocab = point['vocab_size']

        ln_c = math.log(chars)
        ln_n = math.log(non_vocab)
        ln_v = math.log(vocab)

        # Feature names mirror the coefficient names so each can be paired
        # with its weight by key.
        features = {
            'log_chars': ln_c,
            'log_non_vocab': ln_n,
            'log_vocab': ln_v,
            'log_chars_sq': ln_c ** 2,
            'log_non_vocab_sq': ln_n ** 2,
            'log_vocab_sq': ln_v ** 2,
            'log_c_x_log_nv': ln_c * ln_n,
        }

        total = weights['intercept']
        for name, value in features.items():
            total = total + weights[name] * value
        return {'unigram_normalized_loss': total}

    return [predict_one(point) for point in input_data]

#5 Run 5 R² = 0.900809

▼

Python

import math


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate a reduced quadratic log-space scaling law for each data point.

    With C = num_characters, V = vocab_size and P = non_vocab_parameters
    (natural logarithms throughout):

        loss = a0 + a1*log(C) + a2*log(V) + a3*log(P)
               + a4*log(V)^2 + a5*log(P)^2 + a6*log(V)*log(P)

    Args:
        input_data: One dictionary per data point. Each dictionary is read
                    through the keys 'num_characters', 'vocab_size' and
                    'non_vocab_parameters'.
        group: Name of the experimental group whose coefficients should be
                used. Only 'all_data' is available.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the prediction under 'unigram_normalized_loss'.

    Raises:
        ValueError: If group has no fitted coefficients.
    """
    coefficients = {
        'all_data': {
            'a0': 40.1852863461,      # intercept
            'a1': -0.5147260634,      # log(C)
            'a2': 0.2811177220,       # log(V)
            'a3': -3.6353177267,      # log(P)
            'a4': 0.0197370270,       # log(V)^2
            'a5': 0.1038993151,       # log(P)^2
            'a6': -0.0312617648,      # log(V)*log(P)
        },
    }

    if group not in coefficients:
        raise ValueError(f"Unknown group: {group}. Available groups: {list(coefficients.keys())}")

    fit = coefficients[group]
    a0, a1, a2, a3, a4, a5, a6 = (
        fit[key] for key in ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6')
    )

    def predict_one(point):
        chars = point['num_characters']
        vocab = point['vocab_size']
        model_params = point['non_vocab_parameters']

        ln_c = math.log(chars)
        ln_v = math.log(vocab)
        ln_p = math.log(model_params)

        # Linear terms first, then the two squares, then the cross term.
        terms = (
            a1 * ln_c,
            a2 * ln_v,
            a3 * ln_p,
            a4 * ln_v ** 2,
            a5 * ln_p ** 2,
            a6 * ln_v * ln_p,
        )

        total = a0
        for term in terms:
            total = total + term
        return {'unigram_normalized_loss': total}

    return [predict_one(point) for point in input_data]