import numpy as np


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
            point containing input variable names as keys and their
            corresponding values.
        group: The name of the experimental group for which to make predictions.
            The functional form of the law must be the same for all groups,
            but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Group-specific parameters.
    # These were fitted using differential evolution on the training data.
    group_params = {
        'all_data': {
            'E': 1.50388274e+00,
            'A': 4.79640622e+01,
            'alpha': 2.31705082e-01,
            'B': 4.96341790e+01,
            'beta': 1.87783075e-01,
            'C': 2.68264892e-02,
            'lr_scale': 9.70220088e-03,
            'lr_exp': -1.04437884e-01,
            'F': 5.62388220e-03,
            'bsz_scale': 4.05875928e-09,
            'bsz_exp': -4.83464543e-01,
        },
    }
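    # Additional groups could be added here with their own fitted constants;
    # the functional form of the law below stays the same across all groups.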
    # Get parameters for the specified group, falling back to 'all_data'
    # if the requested group is not found.
    params = group_params.get(group, group_params['all_data'])
    # Extract parameters
    E = params['E']
    A = params['A']
    alpha = params['alpha']
    B = params['B']
    beta = params['beta']
    C = params['C']
    lr_scale = params['lr_scale']
    lr_exp = params['lr_exp']
    F = params['F']
    bsz_scale = params['bsz_scale']
    bsz_exp = params['bsz_exp']
    # Make predictions for each data point
    results = []
    for data_point in input_data:
        # Extract input features
        N = data_point['non_embedding_param_size']
        D = data_point['data_size']
        lr = data_point['lr']
        bsz = data_point['bsz']

        # Optimal learning rate (scales with model size)
        lr_opt = lr_scale * (N ** lr_exp)
        # Optimal batch size (scales with model size)
        bsz_opt = bsz_scale * (N ** bsz_exp)

        # Compute the scaling law:
        #   L = E + A/N^alpha + B/D^beta + C*(log(lr/lr_opt))^2 + F*log(bsz_opt/bsz)

        # Base (irreducible) loss
        base_loss = E
        # Model size term (larger models achieve lower loss)
        model_term = A / (N ** alpha)
        # Data size term (more data achieves lower loss)
        data_term = B / (D ** beta)
        # Learning rate penalty (quadratic in log space; penalizes deviation from the optimum)
        lr_penalty = C * ((np.log(lr) - np.log(lr_opt)) ** 2)
        # Batch size effect (logarithmic penalty for a suboptimal batch size)
        bsz_effect = F * np.log(bsz_opt / bsz)

        # Total predicted loss
        lm_loss = base_loss + model_term + data_term + lr_penalty + bsz_effect
        results.append({'lm_loss': float(lm_loss)})

    return results
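

# Minimal usage sketch. The input values below are purely illustrative
# (not drawn from any real training run) and only demonstrate the expected
# dictionary keys: non_embedding_param_size, data_size, lr, bsz.
if __name__ == "__main__":
    example_input = [
        {
            'non_embedding_param_size': 1.0e8,  # N: non-embedding parameter count
            'data_size': 2.0e9,                 # D: training data size
            'lr': 3.0e-4,                       # learning rate
            'bsz': 256,                         # batch size (units as defined by the dataset)
        },
    ]
    predictions = law(example_input, group='all_data')
    print(predictions)  # -> [{'lm_loss': <predicted loss>}]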