SLD - LR & Batch Size Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = -0.773483

▼

Python

import math
from typing import List, Dict

# Only the 'all_data' group was observed; any other group name falls back to it.
DEFAULT_GROUP = "all_data"

# Power-law exponents, one per input variable, shared by every group
# (fitted on the provided dataset). Keys match the input dictionary keys.
EXPONENTS = dict(
    lr=0.008636919053849154,
    bsz=-0.0005162836622543873,
    data_size=-0.04700957690670233,
    non_embedding_param_size=-0.05174150134631459,
)

# Additive constant of the log-space model, keyed by group name.
LOG_INTERCEPTS = {DEFAULT_GROUP: 3.080501739652768}


def _safe_log(x: float) -> float:
    """Return the natural logarithm of ``x`` after validating the input.

    Args:
        x: Value whose logarithm is required.

    Returns:
        ``math.log(x)``.

    Raises:
        ValueError: If ``x`` is ``None``, NaN, zero, or negative.
    """
    # `not x > 0` is used instead of `x <= 0` because every ordered comparison
    # with NaN is False: `x <= 0` would let NaN through and it would then
    # propagate silently into the prediction.
    if x is None or not x > 0:
        raise ValueError("All inputs must be positive real numbers.")
    return math.log(x)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``lm_loss`` for each data point with a log-linear (power-law) model.

    In log space the prediction is the group's intercept plus, for each of
    ``lr``, ``bsz``, ``data_size`` and ``non_embedding_param_size``, the shared
    exponent times the natural log of that input; the result is exponentiated.

    Args:
        input_data: A list of dictionaries, one per data point, each holding
                    the four input variables listed above.
        group: The name of the experimental group to predict for. A name with
               no entry in ``LOG_INTERCEPTS`` falls back to ``DEFAULT_GROUP``.

    Returns:
        A list aligned with ``input_data``; each element is a dictionary of
        the form ``{"lm_loss": prediction}``.

    Raises:
        ValueError: If an input value is rejected by ``_safe_log``
                    (for example, zero or negative values).
    """
    feature_names = ("lr", "bsz", "data_size", "non_embedding_param_size")
    powers = [EXPONENTS[name] for name in feature_names]

    group_key = group if group in LOG_INTERCEPTS else DEFAULT_GROUP
    log_scale = LOG_INTERCEPTS[group_key]

    results = []
    for point in input_data:
        # Convert every input first so a missing or malformed value is
        # reported before any logarithm is taken.
        values = [float(point[name]) for name in feature_names]

        # Accumulate the log-space terms one by one, in feature order.
        log_loss = log_scale
        for power, value in zip(powers, values):
            log_loss += power * _safe_log(value)

        results.append({"lm_loss": math.exp(log_loss)})
    return results

#2 Run 2 R² = -0.773483

▼

Python

import math
from typing import List, Dict

# Power-law scaling law, fitted in log space on the provided dataset:
#   log(lm_loss) = c + a*log(lr) + b*log(bsz) + d*log(data_size)
#                    + e*log(non_embedding_param_size)
# which is equivalent to
#   lm_loss = exp(c) * lr^a * bsz^b * data_size^d * non_embedding_param_size^e

# The dataset contains a single group; it doubles as the default for any
# other group name.
_DEFAULT_GROUP = "all_data"

# Per-group coefficients: "const" is c, and each "log_*" entry is the
# exponent applied to the corresponding input.
_COEFFS: Dict[str, Dict[str, float]] = {
    _DEFAULT_GROUP: dict(
        const=3.080501739652768,
        log_lr=0.008636919053849154,
        log_bsz=-0.0005162836622543873,
        log_data_size=-0.04700957690670233,
        log_nps=-0.05174150134631459,
    ),
}


def _log_pos(x: float) -> float:
    """Return the natural log of ``x``, with ``x`` raised to at least 1e-12.

    ``x`` is converted with ``float()`` first, so zero and negative values
    are clamped to the floor instead of causing a math domain error.
    """
    floor = 1e-12
    value = float(x)
    if floor > value:
        value = floor
    return math.log(value)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``lm_loss`` for each data point with a log-linear (power-law) model.

    The log of the prediction is the group's constant plus a weighted sum of
    the clamped natural logs (see ``_log_pos``) of ``lr``, ``bsz``,
    ``data_size`` and ``non_embedding_param_size``.

    Args:
        input_data: A list of dictionaries, one per data point, keyed by input
                    variable name. A missing variable is read as ``0.0``.
        group: The name of the experimental group to predict for. A name with
               no entry in ``_COEFFS`` uses the ``_DEFAULT_GROUP`` coefficients.

    Returns:
        A list aligned with ``input_data``; each element is a dictionary of
        the form ``{"lm_loss": prediction}``.
    """
    params = _COEFFS.get(group, _COEFFS[_DEFAULT_GROUP])
    offset = params["const"]

    # (input key, coefficient key) pairs, in the order the terms are summed.
    pairing = (
        ("lr", "log_lr"),
        ("bsz", "log_bsz"),
        ("data_size", "log_data_size"),
        ("non_embedding_param_size", "log_nps"),
    )
    input_keys = [input_key for input_key, _ in pairing]
    weights = [params[coef_key] for _, coef_key in pairing]

    predictions: List[Dict[str, float]] = []
    for point in input_data:
        raw_values = [point.get(key, 0.0) for key in input_keys]

        log_loss = offset
        for weight, raw in zip(weights, raw_values):
            log_loss += weight * _log_pos(raw)

        predictions.append({"lm_loss": math.exp(log_loss)})

    return predictions

#3 Run 3 R² = -1.000000

▼

Python

from __future__ import annotations
import math
from typing import Dict, List


def _predict_lm_loss(x: Dict[str, float], coef: Dict[str, float]) -> float:
    """Evaluate the log-polynomial model for a single data point.

    The result is ``coef["intercept"]`` plus terms that are linear in the
    base-10 logs of all four inputs, plus the squared and cross terms of
    ``log10(lr)`` and ``log10(bsz)``.

    Args:
        x: Input variables for one data point. Missing keys are read as 0.0.
        coef: Coefficients keyed by ``intercept``, ``log_lr``, ``log_bsz``,
              ``log_lr2``, ``log_bsz2``, ``lr_bsz``, ``log_data`` and
              ``log_params``.

    Returns:
        The model value as a float.
    """
    # Inputs are raised to at least this value so log10 never sees a
    # non-positive argument.
    floor = 1e-16
    input_keys = ("lr", "bsz", "data_size", "non_embedding_param_size")
    raw = [float(x.get(key, 0.0)) for key in input_keys]
    log_lr, log_bsz, log_data, log_params = (
        math.log10(max(value, floor)) for value in raw
    )

    # (coefficient key, feature value) pairs, in summation order.
    terms = (
        ("log_lr", log_lr),
        ("log_bsz", log_bsz),
        ("log_lr2", log_lr * log_lr),
        ("log_bsz2", log_bsz * log_bsz),
        ("lr_bsz", log_lr * log_bsz),
        ("log_data", log_data),
        ("log_params", log_params),
    )

    total = coef["intercept"]
    for coef_key, feature in terms:
        total += coef[coef_key] * feature
    return float(total)


# Group whose coefficients are used when an unknown group is requested.
DEFAULT_GROUP = "all_data"

# Per-group coefficients obtained by log-polynomial regression on the provided
# dataset. Every group shares the same functional form; only these values may
# differ from group to group.
COEFFS_BY_GROUP: Dict[str, Dict[str, float]] = {
    DEFAULT_GROUP: dict(
        intercept=9.919174347950008,
        log_lr=0.934534343690493,
        log_bsz=-0.6179383648150774,
        log_lr2=0.1368417220658123,
        log_bsz2=0.09978735298702487,
        lr_bsz=-0.060204456752825174,
        log_data=-0.28033584602209644,
        log_params=-0.30419462596816593,
    ),
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``lm_loss`` for each data point with the log-polynomial model.

    Args:
        input_data: A list of dictionaries, one per data point, keyed by input
                    variable name.
        group: The name of the experimental group to predict for. A name with
               no entry in ``COEFFS_BY_GROUP`` uses the ``DEFAULT_GROUP``
               coefficients; the functional form is the same for every group.

    Returns:
        A list aligned with ``input_data``; each element is a dictionary of
        the form ``{"lm_loss": prediction}``.
    """
    fallback = COEFFS_BY_GROUP[DEFAULT_GROUP]
    chosen = COEFFS_BY_GROUP[group] if group in COEFFS_BY_GROUP else fallback
    return [{"lm_loss": _predict_lm_loss(point, chosen)} for point in input_data]