from __future__ import annotations
from typing import Dict, List
import math
# Discovered scaling law (same functional form for all groups):
# loss = L0 + A * params^(-alpha) + B * Neff^(-beta)
# with an effective data term that accounts for limited uniqueness in the corpus:
# Neff = (tokens * (c * unique_tokens)) / (tokens + c * unique_tokens)
# which behaves like a smooth minimum of tokens and c * unique_tokens.
#
# Fitted coefficients per group. If an unknown group is provided, we fall back to
# the "all_data" coefficients.
_GROUP_COEFFS: Dict[str, Dict[str, float]] = {
# Fitted on the provided dataset using non-linear least squares
# L0, A, alpha, B, beta, c
"all_data": {
"L0": 2.38717219,
"A": 1.60700128e04,
"alpha": 5.81892030e-01,
"B": 9.76230068e03,
"beta": 4.22008080e-01,
"c": 2.54449411e01,
},
}
# Default group to use when the provided group is not found
_DEFAULT_GROUP = "all_data"
def _predict_single(P: float, T: float, U: float, coeffs: Dict[str, float]) -> float:
"""Apply the scaling law for a single data point.
Args:
P: params (parameter count)
T: tokens (total training tokens)
U: unique_tokens (number of unique tokens)
coeffs: dictionary with keys {L0, A, alpha, B, beta, c}
Returns:
Predicted loss (float)
"""
L0 = float(coeffs["L0"]) # irreducible loss floor
A = float(coeffs["A"]) # capacity scaling amplitude
alpha = float(coeffs["alpha"]) # capacity exponent (>0)
B = float(coeffs["B"]) # data scaling amplitude
beta = float(coeffs["beta"]) # data exponent (>0)
c = float(coeffs["c"]) # uniqueness-to-tokens coupling scale
# Numerical safety
eps = 1e-12
P = max(float(P), eps)
T = max(float(T), 0.0)
U = max(float(U), 0.0)
# Effective number of independent tokens (smooth min between T and c*U)
CU = c * U
denom = T + CU
if denom <= eps:
Neff = 0.0
else:
Neff = (T * CU) / denom
# Clamp Neff minimally to avoid division by zero in power with negative exponent
Neff = max(Neff, eps)
loss = L0 + A * (P ** (-abs(alpha))) + B * (Neff ** (-abs(beta)))
return float(loss)
def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.
    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Required keys: 'params', 'tokens', 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups,
               but the constant parameters/coefficients can differ per group.
    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s): {'loss': <float>}.
    """
    # Unknown groups silently fall back to the default coefficient set.
    coeffs = _GROUP_COEFFS.get(group, _GROUP_COEFFS[_DEFAULT_GROUP])

    predictions: List[Dict[str, float]] = []
    for record in input_data:
        params = record.get("params")
        tokens = record.get("tokens")
        unique = record.get("unique_tokens")
        # Reject rows with missing (or explicitly None) required fields.
        if params is None or tokens is None or unique is None:
            raise ValueError("Each input dict must contain 'params', 'tokens', and 'unique_tokens'.")
        predictions.append({"loss": _predict_single(params, tokens, unique, coeffs)})
    return predictions