SLD - LR & Batch Size Scaling Law

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.545767

▼

Python

from __future__ import annotations

import math
from typing import Dict, List


# Coefficients learned on the provided dataset for group 'all_data'.
# The x-variables are the log10-transformed inputs computed in _predict_single:
#   x1 = log10(lr), x2 = log10(bsz), x3 = log10(data_size),
#   x4 = log10(non_embedding_param_size)
# Each list entry multiplies the feature at the same position.
# Feature order:
# [1, x1, x2, x3, x4, x1^2, x2^2, x3^2, x4^2, x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4]
_COEFS_BY_GROUP: Dict[str, List[float]] = {
    "all_data": [
        # intercept
        15.408655757208578,
        # linear terms: x1, x2, x3, x4
        0.1479904624134041,
        0.925576816730592,
        -2.0155807017749745,
        -0.21074365992568728,
        # squared terms: x1^2, x2^2, x3^2, x4^2
        0.1445807182504939,
        0.12570943660274597,
        0.13477282782648167,
        0.07811997175906828,
        # pairwise interactions: x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4
        -0.0778445730877946,
        -0.02359921758963033,
        0.1304365497600781,
        -0.12590176704259384,
        -0.050041748839094104,
        -0.09213648452069143,
    ]
}


def _predict_single(sample: Dict[str, float], coefs: List[float]) -> float:
    """Evaluate the quadratic-in-log10 polynomial for one data point.

    Args:
        sample: Mapping that may contain 'lr', 'bsz', 'data_size' and
            'non_embedding_param_size'. A missing key is read as 0.0.
        coefs: Weights in the order
            [1, x1..x4, x1^2..x4^2, x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4].
            If it is shorter than the 15 features, the extra features are
            ignored (zip stops at the shorter sequence).

    Returns:
        The weighted sum of the features as a float.
    """
    # Values are floored at this tiny positive number so log10 never sees
    # zero or a negative input.
    floor = 1e-300
    keys = ("lr", "bsz", "data_size", "non_embedding_param_size")
    raw = [float(sample.get(key, 0.0)) for key in keys]
    logs = [math.log10(max(value, floor)) for value in raw]

    # Constant term, then linear terms, then squares, then pairwise products.
    basis = [1.0, *logs]
    basis.extend(value * value for value in logs)
    for idx, left in enumerate(logs):
        for right in logs[idx + 1:]:
            basis.append(left * right)

    # Plain running sum, accumulated feature by feature.
    total = 0.0
    for term, weight in zip(basis, coefs):
        total += term * weight
    return float(total)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict 'lm_loss' for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group. The functional form is shared
                by all groups; only the coefficients may differ. A group
                without stored coefficients uses the 'all_data' set.

    Returns:
        A list aligned with input_data; each element is a dictionary of the
        form {"lm_loss": <prediction>}.
    """
    # Resolve the weights once; unknown groups share the 'all_data' fit.
    fallback = _COEFS_BY_GROUP["all_data"]
    weights = _COEFS_BY_GROUP.get(group, fallback)
    return [{"lm_loss": _predict_single(point, weights)} for point in input_data]

#2 Run 2 R² = 0.353284

▼

Python

from typing import List, Dict
import math


def _features(example: Dict[str, float]) -> List[float]:
    """Build the 15 quadratic log10-space features for one example.

    The result is ordered as: constant 1.0, the four log10 inputs
    (lr, bsz, data_size, non_embedding_param_size), their squares, and then
    the six pairwise products (x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4).

    A missing key raises KeyError; math.log10 raises ValueError for a
    non-positive value.
    """
    names = ("lr", "bsz", "data_size", "non_embedding_param_size")
    logs = [math.log10(float(example[name])) for name in names]

    row = [1.0]
    row.extend(logs)
    row.extend(value * value for value in logs)
    # Each unordered pair appears once, earlier variable first.
    for idx, left in enumerate(logs):
        row.extend(left * right for right in logs[idx + 1:])
    return row


# Per-group coefficients for the quadratic-in-log model
# y = sum_i c[i] * feature[i]
# Entry i pairs with element i of the list returned by _features:
# [1, x1, x2, x3, x4, x1^2, x2^2, x3^2, x4^2,
#  x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4]
# with x1..x4 = log10 of lr, bsz, data_size, non_embedding_param_size.
_COEFFICIENTS: Dict[str, List[float]] = {
    # Fitted on /app/data (2702 rows)
    # Metrics (fit on all data): R2 ≈ 0.9766, MAE ≈ 0.0198, RMSE ≈ 0.0303
    # NOTE(review): these are the metrics recorded by the original fit on its
    # own training data; they are not recomputed anywhere in this code.
    "all_data": [
        # intercept
        16.8138888600552,
        # linear terms: x1, x2, x3, x4
        0.2624254210535559,
        0.9049176633537738,
        -2.142260361099579,
        -0.34899273153026433,
        # squared terms: x1^2, x2^2, x3^2, x4^2
        0.14853075100299007,
        0.12695557272351365,
        0.13573629866090617,
        0.07862980741271874,
        # pairwise interactions: x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4
        -0.08196050004815598,
        -0.024765714838695822,
        0.12229811653279878,
        -0.12308842768445863,
        -0.05300037765711738,
        -0.08069891827953539,
    ],
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict 'lm_loss' for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group. The functional form is the
                same for every group; only the coefficients may differ.

    Returns:
        A list aligned with input_data; each element is a dictionary of the
        form {"lm_loss": <prediction>}.

    Raises:
        ValueError: If neither the requested group nor 'all_data' provides
            coefficients.
    """
    # A missing (or empty) entry for the group defers to the 'all_data' fit,
    # keeping one functional form with per-group constants.
    coeffs = _COEFFICIENTS.get(group)
    if not coeffs:
        coeffs = _COEFFICIENTS.get("all_data")
    if coeffs is None:
        raise ValueError(f"No coefficients available for group '{group}'.")

    results: List[Dict[str, float]] = []
    for ex in input_data:
        terms = _features(ex)
        # Running dot product of coefficients and features.
        acc = 0.0
        for weight, term in zip(coeffs, terms):
            acc += weight * term
        results.append({"lm_loss": float(acc)})
    return results

#3 Run 3 R² = -0.015989

▼

Python

import math
from typing import Dict, List


# Coefficients fitted on the provided dataset (group: 'all_data').
# Model form:
#   lm_loss = c0
#             + c1 * log10(lr)
#             + c2 * (log10(lr))^2
#             + c3 * log10(bsz)
#             + c4 * log10(data_size)
#             + c5 * log10(non_embedding_param_size)
# Only the learning rate has a squared term; there are no interaction terms.
# Each list must hold exactly six values in the order c0..c5, because
# _predict_single unpacks it positionally.
COEFS: Dict[str, List[float]] = {
    "all_data": [
        9.0203054787606,    # c0 (intercept)
        0.7770969500785967, # c1 (log10(lr))
        0.1340372639030306, # c2 (log10(lr))^2
        0.0006034889974823782, # c3 (log10(bsz))
        -0.2813035622782266,   # c4 (log10(data_size))
        -0.3027047341882954,   # c5 (log10(non_embedding_param_size))
    ]
}


def _predict_single(row: Dict[str, float], coefs: List[float]) -> float:
    """Evaluate the six-coefficient log10 law for one data point.

    Args:
        row: Mapping that may contain 'lr', 'bsz', 'data_size' and
            'non_embedding_param_size'. A missing key is read as 0.0 and
            every value is floored at 1e-12 before the logarithm.
        coefs: Exactly six values [c0, c1, c2, c3, c4, c5]; any other length
            raises ValueError when unpacked.

    Returns:
        c0 + c1*log10(lr) + c2*log10(lr)^2 + c3*log10(bsz)
        + c4*log10(data_size) + c5*log10(non_embedding_param_size).
    """
    intercept, w_lr, w_lr_sq, w_bsz, w_data, w_params = coefs
    floor = 1e-12

    def clamped_log10(key: str) -> float:
        # Floor the value so the logarithm is always defined.
        return math.log10(max(float(row.get(key, 0.0)), floor))

    log_lr = clamped_log10("lr")
    log_bsz = clamped_log10("bsz")
    log_data = clamped_log10("data_size")
    log_params = clamped_log10("non_embedding_param_size")

    terms = (
        w_lr * log_lr,
        w_lr_sq * (log_lr ** 2),
        w_bsz * log_bsz,
        w_data * log_data,
        w_params * log_params,
    )
    # Add the terms left to right, starting from the intercept.
    total = intercept
    for term in terms:
        total = total + term
    return total


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict 'lm_loss' for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point. Expected keys
                    are 'lr', 'bsz', 'data_size', and
                    'non_embedding_param_size'.
        group: Name of the experimental group. The functional form is the
                same for every group; only the coefficients may differ. A
                group without stored coefficients uses the 'all_data' set.

    Returns:
        A list aligned with input_data; each element is a dictionary of the
        form {"lm_loss": <prediction>}.
    """
    # Resolve the weights once; unknown groups share the 'all_data' fit.
    fallback = COEFS["all_data"]
    weights = COEFS.get(group, fallback)
    return [{"lm_loss": float(_predict_single(point, weights))} for point in input_data]

#4 Run 4 R² = -0.079115

▼

Python

from __future__ import annotations

import math
from typing import Dict, List


# Quadratic-in-logs scaling law fitted on the provided dataset.
# Variables (natural log):
#   x_lr = ln(lr)
#   x_b  = ln(bsz)
#   x_d  = ln(data_size)
#   x_n  = ln(non_embedding_param_size)
# Features order:
#   [x_lr, x_b, x_d, x_n,
#    x_lr^2, x_b^2, x_d^2, x_n^2,
#    x_lr*x_b, x_lr*x_d, x_lr*x_n,
#    x_b*x_d, x_b*x_n, x_d*x_n]
# The constant term is not part of the feature list; it is stored separately
# under "intercept".


# Group-specific coefficients (same functional form across groups).
# Trained group available in the dataset: "all_data".
# If an unseen group is requested, we fall back to "all_data".
# Layout per group:
#   "intercept": one-element list; _predict_single reads element [0].
#   "coefs":     14 weights, aligned with the feature order listed above.
COEFFICIENTS: Dict[str, Dict[str, List[float]]] = {
    "all_data": {
        "intercept": [9.845717554648825],
        "coefs": [
            # linear terms
            0.06750242463128774,      # x_lr
            0.28796007724354983,      # x_b
            -0.40647200488009333,     # x_d
            -0.042787852040177925,    # x_n
            # squares
            0.02725586768292816,      # x_lr^2
            0.02407125998953225,      # x_b^2
            0.019730879533995164,     # x_d^2
            0.01893085387016256,      # x_n^2
            # interactions
            -0.014007732297484152,    # x_lr*x_b
            -0.0041614490016316195,   # x_lr*x_d
            0.023898037701275493,     # x_lr*x_n
            -0.022390145708785815,    # x_b*x_d
            -0.0052130124893074985,   # x_b*x_n
            -0.02799258320900191,     # x_d*x_n
        ],
    }
}


def _predict_single(sample: Dict[str, float], params: Dict[str, List[float]]) -> float:
    """Evaluate the quadratic-in-natural-log law for one data point.

    Args:
        sample: Mapping with the keys 'lr', 'bsz', 'data_size' and
            'non_embedding_param_size'; every value must be positive.
        params: Mapping with "intercept" (a one-element list) and "coefs"
            (weights for the 4 linear, 4 squared and 6 interaction features,
            in that order).

    Returns:
        intercept + sum(coef * feature) as a float.

    Raises:
        KeyError: If a required key is missing from sample. The original
            lookup error is attached as the explicit cause.
        ValueError: If any input is zero or negative.
    """
    # Extract and validate inputs
    try:
        lr = float(sample["lr"])  # learning rate
        bsz = float(sample["bsz"])  # batch size
        data_size = float(sample["data_size"])  # tokens/examples seen
        n_params = float(sample["non_embedding_param_size"])  # non-embedding params
    except KeyError as e:
        # Chain explicitly so the traceback reports the failed lookup as the
        # direct cause rather than as an error raised during handling.
        raise KeyError(f"Missing required key: {e}") from e

    if lr <= 0 or bsz <= 0 or data_size <= 0 or n_params <= 0:
        raise ValueError("All inputs must be positive to compute logarithms.")

    x_lr = math.log(lr)
    x_b = math.log(bsz)
    x_d = math.log(data_size)
    x_n = math.log(n_params)

    # Construct feature vector in the fixed order
    feats = [
        x_lr, x_b, x_d, x_n,
        x_lr * x_lr,
        x_b * x_b,
        x_d * x_d,
        x_n * x_n,
        x_lr * x_b,
        x_lr * x_d,
        x_lr * x_n,
        x_b * x_d,
        x_b * x_n,
        x_d * x_n,
    ]

    coefs = params["coefs"]
    # The intercept is stored as a one-element list.
    intercept = params["intercept"][0]
    pred = intercept + sum(c * f for c, f in zip(coefs, feats))
    return float(pred)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict 'lm_loss' for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
                    include the keys 'lr', 'bsz', 'data_size', and
                    'non_embedding_param_size', all with positive values.
        group: Name of the experimental group. The functional form is the
                same for every group; only the coefficients may differ. A
                group without stored parameters uses the 'all_data' set.

    Returns:
        A list aligned with input_data; each element is a dictionary of the
        form {'lm_loss': float}.
    """
    # Look the group up once; anything unknown shares the 'all_data' fit.
    chosen = COEFFICIENTS.get(group)
    chosen = COEFFICIENTS["all_data"] if chosen is None else chosen

    return [{"lm_loss": _predict_single(point, chosen)} for point in input_data]

#5 Run 5 R² = -1.000000

▼

Python

from __future__ import annotations

from math import log
from typing import Dict, List


# Discovered scaling-law functional form (shared across groups):
# Let x1 = log(lr), x2 = log(bsz), x3 = log(data_size), x4 = log(non_embedding_param_size).
# Here log is the natural logarithm (math.log, as called in _predict_one).
#   lm_loss = c0
#             + c1*x1 + c2*x2 + c3*x3 + c4*x4
#             + c5*(x1*x2) + c6*(x1*x3) + c7*(x1*x4)
#             + c8*(x2*x3) + c9*(x2*x4) + c10*(x3*x4)
#             + c11*(x1**2) + c12*(x2**2)
# Only x1 and x2 have squared terms; x3 and x4 enter linearly and through
# the interaction terms.
#
# Coefficients were fit per-group; if an unknown group is requested,
# we fall back to the 'default' set which mirrors the coefficients fit
# on the available training data.


_GROUP_COEFFICIENTS: Dict[str, List[float]] = {
    # Coefficients order:
    # [c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12]
    # Fitted on the provided dataset (group: 'all_data') using ridge regression with
    # a log-polynomial + pairwise interaction basis; see explain.md for details.
    "all_data": [
        0.02829860941068967,
        0.12477378907814929,
        0.29255278340861196,
        0.14949345652091237,
        0.32198741157393185,
        -0.014034212974161742,
        -0.002950229952215042,
        0.01917469948099148,
        -0.013977620291087227,
        -0.015096689742574953,
        -0.0111514016675562,
        0.026822329255043645,
        0.023125093454824875,
    ],
}

# Fallback coefficients for any unseen group (kept identical to 'all_data').
# This binds 'default' to the same list object rather than a copy, so an
# in-place change to either entry is visible through both keys.
_GROUP_COEFFICIENTS["default"] = _GROUP_COEFFICIENTS["all_data"]


def _predict_one(sample: Dict[str, float], coefs: List[float]) -> float:
    """Evaluate the 13-coefficient natural-log law for one data point.

    Args:
        sample: Mapping that may contain 'lr', 'bsz', 'data_size' and
            'non_embedding_param_size'. A missing key is read as 0.0, and any
            value that is not strictly positive is replaced by 1e-12 before
            the logarithm.
        coefs: Exactly 13 values [c0 .. c12]; any other length raises
            ValueError when unpacked.

    Returns:
        The intercept plus the linear, pairwise-interaction and squared
        (lr and bsz only) terms, as a float.
    """
    tiny = 1e-12
    keys = ("lr", "bsz", "data_size", "non_embedding_param_size")
    raw = [float(sample.get(key, 0.0)) for key in keys]
    # Substitute the tiny positive value wherever log would be undefined.
    x1, x2, x3, x4 = [log(value if value > 0.0 else tiny) for value in raw]

    (
        bias,
        w_lr, w_bsz, w_data, w_params,
        w_lr_bsz, w_lr_data, w_lr_params,
        w_bsz_data, w_bsz_params, w_data_params,
        w_lr_sq, w_bsz_sq,
    ) = coefs

    terms = (
        w_lr * x1,
        w_bsz * x2,
        w_data * x3,
        w_params * x4,
        w_lr_bsz * (x1 * x2),
        w_lr_data * (x1 * x3),
        w_lr_params * (x1 * x4),
        w_bsz_data * (x2 * x3),
        w_bsz_params * (x2 * x4),
        w_data_params * (x3 * x4),
        w_lr_sq * (x1 ** 2),
        w_bsz_sq * (x2 ** 2),
    )
    # Add the terms left to right, starting from the intercept.
    total = bias
    for term in terms:
        total = total + term
    return float(total)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict 'lm_loss' for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group. The functional form is shared
                by all groups; only the coefficients may differ. A group
                without stored coefficients uses the 'default' set.

    Returns:
        A list aligned with input_data; each element is a dictionary of the
        form {"lm_loss": <prediction>}.
    """
    # Resolve the weights once; unknown groups share the 'default' entry.
    fallback = _GROUP_COEFFICIENTS["default"]
    weights = _GROUP_COEFFICIENTS.get(group, fallback)
    return [{"lm_loss": _predict_one(point, weights)} for point in input_data]