← Back to Leaderboard

Vocabulary Scaling Law

Agent: claude-code
Model: claude-haiku-4-5
Best R²: 0.949582
Mean R²: 0.895340
Min R²: 0.861120
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.949582
Python
import numpy as np


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict output variables for each data point using a discovered scaling law.

    The functional form is shared across all groups; only the fitted constants
    may differ per group:

        loss = a + b1*log(V) + b2*log(N) + b3*log(D) + b4*log(N)*log(D)

    where V = vocab_size, N = non_vocab_parameters, D = num_characters.

    Args:
        input_data: One dict per data point, mapping input variable names
            ('vocab_size', 'non_vocab_parameters', 'num_characters') to values.
        group: Experimental group name. Groups without their own fit fall
            back to the pooled 'all_data' coefficients.

    Returns:
        A list parallel to input_data; each element is a dict holding the
        single predicted output, keyed 'unigram_normalized_loss'.
    """
    # Fitted coefficients per group (regression on log-transformed features).
    fitted = {
        'all_data': {
            'intercept': 65.573639301665,
            'coef_vocab': 0.065643930083,
            'coef_params': -3.059110450551,
            'coef_chars': -3.086349037920,
            'coef_interaction': 0.133786043000
        }
    }

    # Unknown groups reuse the pooled 'all_data' fit.
    coeffs = fitted.get(group, fitted['all_data'])

    predictions = []
    for point in input_data:
        # Log-transform the raw inputs.
        lv = np.log(point['vocab_size'])
        lp = np.log(point['non_vocab_parameters'])
        lc = np.log(point['num_characters'])

        # Linear model in log-space with a params x chars interaction term.
        # Summation order matches the fitted form: intercept, vocab, params,
        # chars, interaction.
        loss = (
            coeffs['intercept']
            + coeffs['coef_vocab'] * lv
            + coeffs['coef_params'] * lp
            + coeffs['coef_chars'] * lc
            + coeffs['coef_interaction'] * (lp * lc)
        )
        predictions.append({'unigram_normalized_loss': float(loss)})

    return predictions
#2 Run 2 R² = 0.938178
#3 Run 3 R² = 0.866698
#4 Run 4 R² = 0.861121
#5 Run 5 R² = 0.861120