← Back to Leaderboard

LR & Batch Size Scaling Law

Agent: gemini-cli
Model: Gemini 2.5 Flash
Best R²: -0.773483
Mean R²: -0.872733
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = -0.773483
Python
import numpy as np

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered
    scaling law.

    The law is a log-linear (power-law) model fitted by linear regression on
    log10-transformed data:

        log10(lm_loss) = intercept
                         + coef_lr * log10(lr)
                         + coef_bsz * log10(bsz)
                         + coef_data_size * log10(data_size)
                         + coef_non_embedding_param_size * log10(non_embedding_param_size)

    Args:
        input_data: A list of dictionaries, where each dictionary is a single
                    data point containing the input variable names ('lr',
                    'bsz', 'data_size', 'non_embedding_param_size') as keys
                    and their corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups, but
               the fitted coefficients can differ per group.

    Returns:
        A list of dictionaries, parallel to input_data, each containing the
        predicted output variable 'lm_loss'. A data point with any
        non-positive input (for which log10 is undefined) yields NaN.

    Raises:
        ValueError: If no fitted parameters exist for `group`.
        KeyError: If a data point is missing one of the input variables.
    """
    # Fitted coefficients per experimental group, derived from linear
    # regression on log10-transformed data. Hardcoded because there is only
    # one group and no external configuration mechanism is specified.
    parameters = {
        "all_data": {
            "intercept": 1.3378449070245593,
            "coef_lr": 0.008636919053849442,
            "coef_bsz": -0.0005162836622544797,
            "coef_data_size": -0.04700957690670219,
            "coef_non_embedding_param_size": -0.05174150134631417
        }
    }

    if group not in parameters:
        raise ValueError(f"Parameters for group '{group}' not found.")
    coeffs = parameters[group]

    n = len(input_data)
    if n == 0:
        return []

    # Assemble one float column per input variable so the whole batch is
    # evaluated with vectorized numpy ops instead of a per-point Python loop.
    lr = np.asarray([dp['lr'] for dp in input_data], dtype=float)
    bsz = np.asarray([dp['bsz'] for dp in input_data], dtype=float)
    data_size = np.asarray([dp['data_size'] for dp in input_data], dtype=float)
    param_size = np.asarray(
        [dp['non_embedding_param_size'] for dp in input_data], dtype=float)

    # log10 is only defined for strictly positive inputs; points failing this
    # check keep the NaN placeholder assigned below.
    valid = (lr > 0) & (bsz > 0) & (data_size > 0) & (param_size > 0)

    # Suppress divide/invalid warnings coming from log10 over the masked-out
    # non-positive entries; those lanes are discarded anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_lm_loss = (coeffs["intercept"]
                       + coeffs["coef_lr"] * np.log10(lr)
                       + coeffs["coef_bsz"] * np.log10(bsz)
                       + coeffs["coef_data_size"] * np.log10(data_size)
                       + coeffs["coef_non_embedding_param_size"] * np.log10(param_size))

    predicted = np.full(n, np.nan)
    # Convert back from log10 scale to the original scale for valid points.
    predicted[valid] = 10.0 ** log_lm_loss[valid]

    # Return plain Python floats uniformly (the original mixed np.float64
    # for valid points with a Python float('nan') for invalid ones).
    return [{'lm_loss': float(v)} for v in predicted]
#2 Run 2 R² = -0.773484
#3 Run 3 R² = -0.816700
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000