SLD - Vocabulary Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = 0.980344

▼

Python

import math
from typing import List, Dict

# Quadratic polynomial in natural logs of inputs, fitted via ridge regression (alpha ~= 1e-6)
# Target: unigram_normalized_loss
# Features: 1, ln N, ln D, ln V, (ln N)^2, (ln N)(ln D), (ln N)(ln V), (ln D)^2, (ln D)(ln V), (ln V)^2
# Where N = non_vocab_parameters, D = num_characters, V = vocab_size

# Group whose coefficients are used whenever an unrecognised group is requested.
_FALLBACK_GROUP = "all_data"

# Coefficients of the quadratic-in-logs law, keyed by experimental group.
# The functional form is shared by every group; only these numbers may differ.
# 'all_data' was the only group present in the released dataset.
_COEFFICIENTS_BY_GROUP: Dict[str, Dict[str, float]] = {
    _FALLBACK_GROUP: dict(
        c0=43.65241359337898,                # intercept
        c_log_N=0.5845970659865998,          # ln N
        c_log_D=-4.504391609574668,          # ln D
        c_log_V=0.7794943512417376,          # ln V
        c_log_N2=0.02581377699971201,        # (ln N)^2
        c_log_N_log_D=-0.0813545696437359,   # (ln N)(ln D)
        c_log_N_log_V=0.022588042542338404,  # (ln N)(ln V)
        c_log_D2=0.13736449927091602,        # (ln D)^2
        c_log_D_log_V=-0.0738696772199968,   # (ln D)(ln V)
        c_log_V2=0.0285489527696865,         # (ln V)^2
    ),
}


def _predict_single(x: Dict[str, float], coeffs: Dict[str, float]) -> float:
    """Evaluate the quadratic-in-logs law for one data point.

    Args:
        x: Mapping read for 'non_vocab_parameters' (N), 'num_characters' (D)
            and 'vocab_size' (V). A missing key is read as 0.0.
        coeffs: Coefficients keyed 'c0', 'c_log_N', 'c_log_D', 'c_log_V',
            'c_log_N2', 'c_log_N_log_D', 'c_log_N_log_V', 'c_log_D2',
            'c_log_D_log_V' and 'c_log_V2'.

    Returns:
        The polynomial value as a float, or NaN when N, D or V is zero or
        negative (the natural log would be undefined).
    """
    raw_inputs = [
        float(x.get(field, 0.0))
        for field in ("non_vocab_parameters", "num_characters", "vocab_size")
    ]
    # NaN is the "invalid prediction" signal; no exception is raised here.
    if any(value <= 0 for value in raw_inputs):
        return float("nan")

    ln_n, ln_d, ln_v = [math.log(value) for value in raw_inputs]

    # Terms are listed in the exact order they are accumulated, so the
    # floating-point rounding sequence stays fixed.
    weighted_features = (
        ("c_log_N", ln_n),
        ("c_log_D", ln_d),
        ("c_log_V", ln_v),
        ("c_log_N2", ln_n ** 2),
        ("c_log_N_log_D", ln_n * ln_d),
        ("c_log_N_log_V", ln_n * ln_v),
        ("c_log_D2", ln_d ** 2),
        ("c_log_D_log_V", ln_d * ln_v),
        ("c_log_V2", ln_v ** 2),
    )
    total = coeffs["c0"]
    for name, feature in weighted_features:
        total = total + coeffs[name] * feature
    return float(total)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to each data point in ``input_data``.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. All groups share
                the same functional form; only the coefficients are looked up
                per group. A group with no entry in the coefficient table uses
                the fallback group's coefficients.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key 'unigram_normalized_loss' holding the prediction.
    """
    fallback_coeffs = _COEFFICIENTS_BY_GROUP[_FALLBACK_GROUP]
    coeffs = _COEFFICIENTS_BY_GROUP.get(group, fallback_coeffs)
    return [
        {"unigram_normalized_loss": _predict_single(point, coeffs)}
        for point in input_data
    ]

#2 Run 2 R² = 0.980329

▼

Python

from __future__ import annotations

from math import log
from typing import Dict, List

# Quadratic-in-logs scaling law fitted on the provided dataset
# Targets: unigram_normalized_loss
# Inputs: vocab_size (V), non_vocab_parameters (P), num_characters (D)
#
# y = c0
#     + c1*ln(V) + c2*ln(P) + c3*ln(D)
#     + c4*ln(V)^2 + c5*ln(V)*ln(P) + c6*ln(V)*ln(D)
#     + c7*ln(P)^2 + c8*ln(P)*ln(D) + c9*ln(D)^2
#
# The functional form is identical for all groups; coefficients may differ per group.
# The dataset contains a single group 'all_data'. If an unknown group is requested,
# we fall back to 'all_data'.

# Group name -> fitted coefficients 'c0'..'c9'.
# Values come from ordinary least squares on degree-2 polynomial features of the
# natural logs, using the provided dataset.
# Metrics reported for this fit (on provided data): R^2 ≈ 0.988, RMSE ≈ 0.088;
# holdout R^2 ≈ 0.986.
_COEFFICIENTS: Dict[str, Dict[str, float]] = {
    "all_data": {
        f"c{index}": value
        for index, value in enumerate(
            (
                43.65302340313523,      # c0: intercept
                0.7794957511937278,     # c1: ln(V)
                0.5846007123502589,     # c2: ln(P)
                -4.504394566402874,     # c3: ln(D)
                0.028553981965247575,   # c4: ln(V)^2
                0.022592838156027455,   # c5: ln(V)*ln(P)
                -0.07386461582128316,   # c6: ln(V)*ln(D)
                0.025813565754715825,   # c7: ln(P)^2
                -0.08135643672422146,   # c8: ln(P)*ln(D)
                0.13736040362701446,    # c9: ln(D)^2
            )
        )
    },
}

_EPS = 1e-12  # floor applied to every input before taking its log


def _predict_one(
    vocab_size: float,
    non_vocab_parameters: float,
    num_characters: float,
    coefs: Dict[str, float],
) -> float:
    """Evaluate the quadratic-in-logs law for one data point.

    Args:
        vocab_size: V in the formula.
        non_vocab_parameters: P in the formula.
        num_characters: D in the formula.
        coefs: Coefficients keyed 'c0' (intercept) through 'c9'.

    Returns:
        c0 + c1*ln(V) + c2*ln(P) + c3*ln(D) + c4*ln(V)^2 + c5*ln(V)*ln(P)
        + c6*ln(V)*ln(D) + c7*ln(P)^2 + c8*ln(P)*ln(D) + c9*ln(D)^2.
        Inputs below ``_EPS`` (including zero and negatives) are raised to
        ``_EPS`` first, so the logarithm is always defined.
    """
    log_v, log_p, log_d = [
        log(max(float(value), _EPS))
        for value in (vocab_size, non_vocab_parameters, num_characters)
    ]

    # Insertion order is the accumulation order, which keeps the
    # floating-point rounding sequence fixed.
    features = {
        "c1": log_v,
        "c2": log_p,
        "c3": log_d,
        "c4": log_v ** 2,
        "c5": log_v * log_p,
        "c6": log_v * log_d,
        "c7": log_p ** 2,
        "c8": log_p * log_d,
        "c9": log_d ** 2,
    }
    prediction = coefs["c0"]
    for key, feature in features.items():
        prediction = prediction + coefs[key] * feature
    return prediction


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to each data point in ``input_data``.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values. 'vocab_size',
                    'non_vocab_parameters' and 'num_characters' are read;
                    a missing key is passed on as 0.0.
        group: Name of the experimental group to predict for. All groups share
                the same functional form; only the coefficients are looked up
                per group. A group with no entry uses the 'all_data'
                coefficients.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key 'unigram_normalized_loss' holding the prediction.
    """
    default_coefs = _COEFFICIENTS["all_data"]
    coefs = _COEFFICIENTS.get(group, default_coefs)
    input_names = ("vocab_size", "non_vocab_parameters", "num_characters")

    predictions: List[Dict[str, float]] = []
    for record in input_data:
        kwargs = {name: record.get(name, 0.0) for name in input_names}
        value = _predict_one(coefs=coefs, **kwargs)
        predictions.append({"unigram_normalized_loss": float(value)})
    return predictions

#3 Run 3 R² = 0.980329

▼

Python

from __future__ import annotations

import math
from typing import Dict, List

# Quadratic-in-logs scaling law discovered on the provided dataset.
# Variables: 
#   P = non_vocab_parameters
#   V = vocab_size
#   D = num_characters
# Prediction: unigram_normalized_loss
#
# Functional form (natural logarithms):
#   y = b0
#       + b1 * ln P + b2 * ln V + b3 * ln D
#       + b4 * (ln P)^2 + b5 * (ln P)(ln V) + b6 * (ln P)(ln D)
#       + b7 * (ln V)^2 + b8 * (ln V)(ln D) + b9 * (ln D)^2
#
# Coefficients can differ per experimental group, but the form is the same.
# Below are the coefficients fitted for the groups present in the training data.

# Group name -> fitted coefficients 'b0'..'b9'.
# Only 'all_data' is present in the provided dataset. The intercept and the
# coefficients were learned via ordinary least squares on the full quadratic
# features in ln P, ln V, ln D.
COEFFICIENTS: Dict[str, Dict[str, float]] = dict(
    all_data=dict(
        b0=43.65302340313573,     # intercept
        b1=0.5846007123502085,    # ln P
        b2=0.779495751193743,     # ln V
        b3=-4.504394566402879,    # ln D
        b4=0.02581356575471539,   # (ln P)^2
        b5=0.02259283815602762,   # (ln P)(ln V)
        b6=-0.08135643672421937,  # (ln P)(ln D)
        b7=0.028553981965246198,  # (ln V)^2
        b8=-0.07386461582128219,  # (ln V)(ln D)
        b9=0.1373604036270136,    # (ln D)^2
    ),
)

# Group whose coefficients are used when an unknown group is requested.
_DEFAULT_GROUP = "all_data"


def _predict_single(d: Dict[str, float], coeffs: Dict[str, float]) -> float:
    """Predict unigram_normalized_loss for a single datum.

    Evaluates the full quadratic polynomial in ln P, ln V and ln D, where
    P = non_vocab_parameters, V = vocab_size and D = num_characters.

    Args:
        d: Data point; must contain 'non_vocab_parameters', 'vocab_size'
            and 'num_characters'.
        coeffs: Coefficients keyed 'b0' (intercept) through 'b9'.

    Returns:
        The predicted value as a float.

    Raises:
        KeyError: If a required input key is absent from ``d``. The original
            lookup error is attached as ``__cause__``.
        ValueError: If any of the three inputs is zero or negative.
    """
    try:
        P = float(d["non_vocab_parameters"])  # non-vocabulary parameters
        V = float(d["vocab_size"])            # vocabulary size
        D = float(d["num_characters"])        # number of characters (data)
    except KeyError as e:
        # Chain explicitly so the traceback shows the failed lookup as the
        # direct cause rather than as an unrelated error during handling.
        raise KeyError(
            f"Missing required key {e!s}. Required keys: 'non_vocab_parameters', 'vocab_size', 'num_characters'"
        ) from e

    # Logarithms are undefined for non-positive values; fail loudly.
    if P <= 0 or V <= 0 or D <= 0:
        raise ValueError("All inputs must be positive to compute logarithms.")

    lnP = math.log(P)
    lnV = math.log(V)
    lnD = math.log(D)

    y = (
        coeffs["b0"]
        + coeffs["b1"] * lnP
        + coeffs["b2"] * lnV
        + coeffs["b3"] * lnD
        + coeffs["b4"] * (lnP ** 2)
        + coeffs["b5"] * (lnP * lnV)
        + coeffs["b6"] * (lnP * lnD)
        + coeffs["b7"] * (lnV ** 2)
        + coeffs["b8"] * (lnV * lnD)
        + coeffs["b9"] * (lnD ** 2)
    )
    return float(y)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to each data point in ``input_data``.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. All groups share
                the same functional form; only the coefficients are looked up
                per group. A group with no entry in the coefficient table uses
                the default group's coefficients.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key 'unigram_normalized_loss' holding the prediction.
    """
    default_coeffs = COEFFICIENTS[_DEFAULT_GROUP]
    coeffs = COEFFICIENTS.get(group, default_coeffs)
    return [
        {"unigram_normalized_loss": _predict_single(datum, coeffs)}
        for datum in input_data
    ]

#4 Run 4 R² = 0.933929

▼

Python

from __future__ import annotations
from typing import List, Dict

# Discovered scaling law (inverse power-law saturation):
#   L = L_inf + A * P^{-alpha} + B * C^{-beta} + D * V^{-gamma}
# where
#   L: unigram_normalized_loss (lower is better)
#   P: non_vocab_parameters (parameters not allocated to the embedding)
#   C: num_characters (training data size in characters)
#   V: vocab_size (number of tokens in the vocabulary)
# The same functional form is used for all groups; coefficients differ per group.
#
# Coefficients below were fit on the provided dataset using non-linear least squares.
# If an unknown group is requested, we fall back to the 'all_data' parameters.

# Group whose parameters are used when an unknown group is requested.
_FALLBACK_GROUP = "all_data"

# Group name -> parameters of the inverse power-law model.
# Fitted on the provided dataset (group == 'all_data') with SciPy curve_fit
# using bounds. See /app/explain.md for details.
_PARAMS_BY_GROUP: Dict[str, Dict[str, float]] = {
    _FALLBACK_GROUP: dict(
        zip(
            ("Linf", "A", "alpha", "B", "beta", "D", "gamma"),
            (
                -5.803541619999999,  # Linf: asymptotic loss at infinite scale
                2.68428369e+01,      # A: coefficient for parameters term
                2.36765535e-01,      # alpha: exponent for parameters term
                6.70168939e+03,      # B: coefficient for data term
                3.80181291e-01,      # beta: exponent for data term
                2.22135843e+02,      # D: coefficient for vocab term
                2.80988265e+00,      # gamma: exponent for vocab term
            ),
        )
    ),
}


def _predict_one(x: Dict[str, float], params: Dict[str, float]) -> float:
    """Evaluate the inverse power-law model for one data point.

    Args:
        x: Mapping read for 'non_vocab_parameters' (P), 'num_characters' (C)
            and 'vocab_size' (V). A missing key is read as 0.0.
        params: Model parameters keyed 'Linf', 'A', 'alpha', 'B', 'beta',
            'D' and 'gamma'.

    Returns:
        Linf + A * P**(-alpha) + B * C**(-beta) + D * V**(-gamma), with each
        of P, C and V raised to at least 1e-12 beforehand.
    """
    floor = 1e-12  # keeps zero/negative inputs out of the power terms
    model_size, data_size, vocab = [
        max(float(x.get(field, 0.0)), floor)
        for field in ("non_vocab_parameters", "num_characters", "vocab_size")
    ]

    l_inf, a, alpha, b, beta, d, gamma = [
        params[key] for key in ("Linf", "A", "alpha", "B", "beta", "D", "gamma")
    ]

    # Accumulate left to right: parameters term, data term, vocab term.
    total = l_inf
    for scale, exponent, size in (
        (a, alpha, model_size),
        (b, beta, data_size),
        (d, gamma, vocab),
    ):
        total = total + scale * (size ** (-exponent))
    return total


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to each data point in ``input_data``.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. All groups share
                the same functional form; only the parameters are looked up
                per group. A group with no entry in the parameter table uses
                the fallback group's parameters.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key 'unigram_normalized_loss' holding the prediction.
    """
    fallback_params = _PARAMS_BY_GROUP[_FALLBACK_GROUP]
    params = _PARAMS_BY_GROUP.get(group, fallback_params)
    return [
        {"unigram_normalized_loss": float(_predict_one(point, params))}
        for point in input_data
    ]

#5 Run 5 R² = 0.933122

▼

Python

from __future__ import annotations
from typing import List, Dict

# Discovered scaling law (additive power-law with a constant floor):
#   y = L + A * Np^{-alpha} + B * D^{-beta} + C * V^{-gamma}
# where
#   y  = unigram_normalized_loss
#   Np = non_vocab_parameters
#   D  = num_characters
#   V  = vocab_size
# The functional form is shared across groups; coefficients may vary by group.
# This repository's dataset only contains a single group ("all_data"). We also
# provide a "default" set of parameters that mirrors the same fit.

# Group name -> parameters of the additive power-law model.
#
# The fit came from a cross-validated grid search over the exponents
# (alpha, beta, gamma) with least squares on the coefficients, nonnegativity
# encouraged for A, B, C, and the intercept re-fit on the full data.
# The vocabulary-size effect is negligible in this dataset (C ~ 0), so gamma is
# kept for completeness but contributes little.
#
# "all_data" and "default" carry the same values; each gets its own dict.
PARAMS_BY_GROUP = {
    group_name: dict(
        alpha=0.06,      # exponent on non_vocab_parameters
        beta=0.35,       # exponent on num_characters
        gamma=0.02,      # exponent on vocab_size
        L=-6.45718219,   # constant floor
        A=2.51363526,    # coefficient of the parameters term
        B=3866.31610,    # coefficient of the data term
        C=0.0,           # coefficient of the vocab term
    )
    for group_name in ("all_data", "default")
}


def _predict_one(x: Dict[str, float], p: Dict[str, float]) -> float:
    """Evaluate the additive power-law model for one data point.

    Args:
        x: Mapping read for 'vocab_size' (V), 'non_vocab_parameters' (Np)
            and 'num_characters' (D). A missing key is read as 0.0.
        p: Model parameters keyed 'alpha', 'beta', 'gamma', 'L', 'A', 'B'
            and 'C'.

    Returns:
        L + A * Np**(-alpha) + B * D**(-beta) + C * V**(-gamma) as a float,
        with each of V, Np and D raised to at least 1e-12 beforehand.
    """
    floor = 1e-12  # keeps zero/negative inputs out of the power terms
    vocab, model_size, data_size = [
        max(float(x.get(field, 0.0)), floor)
        for field in ("vocab_size", "non_vocab_parameters", "num_characters")
    ]

    alpha, beta, gamma, offset, a, b, c = [
        p[key] for key in ("alpha", "beta", "gamma", "L", "A", "B", "C")
    ]

    model_term = a * (model_size ** (-alpha))
    data_term = b * (data_size ** (-beta))
    vocab_term = c * (vocab ** (-gamma))
    return float(offset + model_term + data_term + vocab_term)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to each data point in ``input_data``.

    Args:
        input_data: A list of dictionaries, one per data point, mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. All groups share
                the same functional form; only the parameters are looked up
                per group. A group with no entry in the parameter table uses
                the "default" parameters.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key 'unigram_normalized_loss' holding the prediction.
    """
    default_params = PARAMS_BY_GROUP["default"]
    params = PARAMS_BY_GROUP.get(group, default_params)
    return [
        {"unigram_normalized_loss": _predict_one(record, params)}
        for record in input_data
    ]