from __future__ import annotations
from math import log10
from typing import Dict, List
# Quadratic-in-log scaling law with key interactions for language modeling loss.
#
# Let ld = log10(data_size), lp = log10(non_embedding_param_size),
# llr = log10(lr), lb = log10(bsz).
#
# lm_loss = c0
#           + c1 * ld
#           + c2 * lp
#           + c3 * llr
#           + c4 * lb
#           + c5 * llr**2
#           + c6 * ld * lp
#           + c7 * ld**2
#           + c8 * lp**2
#           + c9 * llr * ld
#           + c10 * llr * lp
#           + c11 * lb * ld
#           + c12 * lb * lp
#           + c13 * lb * llr
#           + c14 * lb**2
#
# Coefficients are fitted per experimental group. If an unknown group is
# requested, we fall back to the 'all_data' coefficients.
_COEFFS_BY_GROUP: Dict[str, List[float]] = {
# Order:
# [c0, c1(ld), c2(lp), c3(llr), c4(lb), c5(llr^2), c6(ld*lp), c7(ld^2), c8(lp^2),
# c9(llr*ld), c10(llr*lp), c11(lb*ld), c12(lb*lp), c13(lb*llr), c14(lb^2)]
# Fitted on the provided dataset (/app/data)
# Using least squares on 2702 points, R^2 ≈ 0.977 (5-fold CV ≈ 0.976)
"all_data": [
1.681388886e01, # c0
-2.14226036e00, # c1 (ld)
-3.48992730e-01, # c2 (lp)
2.62425420e-01, # c3 (llr)
9.04917660e-01, # c4 (lb)
1.48530750e-01, # c5 (llr^2)
-8.06989200e-02, # c6 (ld*lp)
1.35736300e-01, # c7 (ld^2)
7.86298100e-02, # c8 (lp^2)
-2.47657100e-02, # c9 (llr*ld)
1.22298120e-01, # c10 (llr*lp)
-1.23088430e-01, # c11 (lb*ld)
-5.30003800e-02, # c12 (lb*lp)
-8.19605000e-02, # c13 (lb*llr)
1.26955570e-01, # c14 (lb^2)
],
}
# Default/fallback coefficients
_DEFAULT_GROUP = "all_data"
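

# The helper below is an illustrative sketch, not the original fitting code:
# it shows how coefficients in the order documented above could be refit with
# ordinary least squares. It assumes numpy is available and that each row
# carries the same input keys as `law` plus an observed 'lm_loss' target.
def _fit_coeffs(rows: List[Dict[str, float]]) -> List[float]:
    """Refit the 15 coefficients via least squares (illustrative only)."""
    import numpy as np  # assumed dependency; imported lazily on purpose

    X: List[List[float]] = []
    y: List[float] = []
    for r in rows:
        ld = log10(r["data_size"])
        lp = log10(r["non_embedding_param_size"])
        llr = log10(r["lr"])
        lb = log10(r["bsz"])
        # Column order must match _COEFFS_BY_GROUP: c0 .. c14.
        X.append([1.0, ld, lp, llr, lb, llr**2, ld * lp, ld**2, lp**2,
                  llr * ld, llr * lp, lb * ld, lb * lp, lb * llr, lb**2])
        y.append(r["lm_loss"])
    coeffs, *_ = np.linalg.lstsq(np.asarray(X), np.asarray(y), rcond=None)
    return coeffs.tolist()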
def _safe_log10(x: float) -> float:
"""Compute log10 with a tiny positive floor for numerical safety.
The dataset and expected inputs should be strictly positive for all variables,
but we guard against accidental non-positive inputs by flooring to a tiny
positive value to avoid math domain errors and keep the function robust.
"""
# Floor near double-precision minimum, but not too extreme to avoid inf
tiny = 1e-300
if not isinstance(x, (int, float)):
raise TypeError(f"Expected a number, got {type(x)}")
if x <= 0 or x != x: # also handles NaN
x = tiny
return log10(x)
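

# Example behavior of the floor above: _safe_log10(10000.0) returns 4.0,
# while _safe_log10(0.0) and _safe_log10(float("nan")) both floor to
# log10(1e-300) = -300.0 instead of raising.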
def _predict_row(row: Dict[str, float], coeffs: List[float]) -> float:
ld = _safe_log10(float(row["data_size"]))
lp = _safe_log10(float(row["non_embedding_param_size"]))
llr = _safe_log10(float(row["lr"]))
lb = _safe_log10(float(row["bsz"]))
(
c0, c1, c2, c3, c4,
c5, c6, c7, c8,
c9, c10, c11, c12, c13, c14,
) = coeffs
y = (
c0
+ c1 * ld
+ c2 * lp
+ c3 * llr
+ c4 * lb
+ c5 * (llr ** 2)
+ c6 * ld * lp
+ c7 * (ld ** 2)
+ c8 * (lp ** 2)
+ c9 * llr * ld
+ c10 * llr * lp
+ c11 * lb * ld
+ c12 * lb * lp
+ c13 * lb * llr
+ c14 * (lb ** 2)
)
return float(y)
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
"""
Predicts output variables based on input variables according to a discovered scaling law.
Args:
input_data: A list of dictionaries, where each dictionary is a single data
point containing input variable names as keys and their
corresponding values. Required keys per dict:
- 'lr'
- 'bsz'
- 'data_size'
- 'non_embedding_param_size'
group: The name of the experimental group for which to make predictions.
The functional form of the law is the same for all groups, but
the constant parameters/coefficients can differ per group.
    Returns:
        A list of dictionaries, parallel to input_data, each containing one key:
            - 'lm_loss': the predicted language modeling loss.
"""
coeffs = _COEFFS_BY_GROUP.get(group, _COEFFS_BY_GROUP[_DEFAULT_GROUP])
outputs: List[Dict[str, float]] = []
for row in input_data:
y = _predict_row(row, coeffs)
outputs.append({"lm_loss": y})
return outputs
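

if __name__ == "__main__":
    # Minimal usage sketch; the input values below are made up for
    # illustration and do not come from the fitted dataset.
    sample = [{
        "lr": 3e-4,
        "bsz": 256.0,
        "data_size": 1e9,
        "non_embedding_param_size": 1e8,
    }]
    print(law(sample, "all_data"))  # -> [{'lm_loss': ...}]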