from __future__ import annotations
import math
from typing import Dict, List
# Discovered scaling law:
# Let x1 = log10(lr), x2 = log10(bsz), x3 = log10(data_size), x4 = log10(non_embedding_param_size).
# Predict lm_loss as a quadratic polynomial in these log-variables with interactions.
# The functional form is the same across groups; coefficients may differ by group.
# Coefficients fitted on the provided dataset for group "all_data" using
# Ridge regression on quadratic polynomial features of the log-variables.
# Keys correspond to polynomial feature names.
# Fitted coefficients, keyed by group name and then by polynomial feature
# name. All groups share the same quadratic functional form; only the
# coefficient values may differ per group. Feature names here must match the
# keys produced in `_predict_one`'s `terms` dict.
_COEFFICIENTS_BY_GROUP: Dict[str, Dict[str, float]] = {
    "all_data": {
        # Intercept and linear terms in the log10-transformed variables.
        "1": 16.497915,
        "log_lr": 0.266742,
        "log_bsz": 0.907321,
        "log_data_size": -2.112344,
        "log_non_embedding_param_size": -0.308876,
        # Quadratic terms
        "log_lr^2": 0.148389,
        "log_bsz^2": 0.126924,
        "log_data_size^2": 0.134987,
        "log_non_embedding_param_size^2": 0.077240,
        # Pairwise interactions
        "log_lr log_bsz": -0.081928,
        "log_lr log_data_size": -0.024850,
        "log_lr log_non_embedding_param_size": 0.121794,
        "log_bsz log_data_size": -0.123098,
        "log_bsz log_non_embedding_param_size": -0.053240,
        "log_data_size log_non_embedding_param_size": -0.082462,
    }
}
# If an unknown group is provided, fall back to this group name
_DEFAULT_GROUP: str = "all_data"
def _predict_one(d: Dict[str, float], coeffs: Dict[str, float]) -> float:
# Extract and validate input variables
try:
lr = float(d["lr"])
bsz = float(d["bsz"])
data_size = float(d["data_size"])
non_emb_params = float(d["non_embedding_param_size"])
except KeyError as e:
raise KeyError(f"Missing required key: {e.args[0]}")
if lr <= 0 or bsz <= 0 or data_size <= 0 or non_emb_params <= 0:
raise ValueError("All inputs must be positive to compute logarithms.")
# Log10 transform
log_lr = math.log10(lr)
log_bsz = math.log10(bsz)
log_data_size = math.log10(data_size)
log_non_emb = math.log10(non_emb_params)
# Compute polynomial terms
terms = {
"1": 1.0,
"log_lr": log_lr,
"log_bsz": log_bsz,
"log_data_size": log_data_size,
"log_non_embedding_param_size": log_non_emb,
"log_lr^2": log_lr * log_lr,
"log_bsz^2": log_bsz * log_bsz,
"log_data_size^2": log_data_size * log_data_size,
"log_non_embedding_param_size^2": log_non_emb * log_non_emb,
"log_lr log_bsz": log_lr * log_bsz,
"log_lr log_data_size": log_lr * log_data_size,
"log_lr log_non_embedding_param_size": log_lr * log_non_emb,
"log_bsz log_data_size": log_bsz * log_data_size,
"log_bsz log_non_embedding_param_size": log_bsz * log_non_emb,
"log_data_size log_non_embedding_param_size": log_data_size * log_non_emb,
}
# Weighted sum
y = 0.0
for name, val in terms.items():
coef = coeffs.get(name, 0.0)
y += coef * val
return y
def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.
    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law must be the same for all groups,
               but the constant parameters/coefficients can differ per group.
    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Resolve the group's coefficient table, falling back to the default
    # group when the requested one is unknown.
    try:
        group_coeffs = _COEFFICIENTS_BY_GROUP[group]
    except KeyError:
        group_coeffs = _COEFFICIENTS_BY_GROUP[_DEFAULT_GROUP]
    # One prediction dict per input point, in the same order.
    return [
        {"lm_loss": float(_predict_one(point, group_coeffs))}
        for point in input_data
    ]