Data-Constrained Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.936135
Mean R²: 0.919620
Min R²: 0.914136
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.936135
Python
from typing import List, Dict
import math

# Fitted coefficients per group for the scaling law:
# loss = L0 + a * params^(-alpha) + b * tokens^(-beta) + c * ln(unique_tokens)
#
# Notes:
# - Coefficients below were obtained via non-linear least squares on the provided dataset.
# - If an unknown group is requested, we fall back to the 'all_data' coefficients.
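# - A short usage example and an illustrative refit sketch follow the run list below.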

_COEFFS = {
    # Fitted on the dataset at /app/data (161 points, single group 'all_data')
    "all_data": {
        "L0": 5.314158928164251,
        "a": 4163.742173986624,
        "alpha": 0.4910050761229603,
        "b": 109180.20697694572,
        "beta": 0.5637776884040872,
        "c": -0.11944428211525198,
    }
}


def _predict_single(x: Dict[str, float], k: Dict[str, float]) -> float:
    # Clamp inputs to avoid domain errors: a non-positive base raised to a
    # negative power, or the log of a non-positive unique-token count.
    params = max(float(x.get("params", 0.0)), 1e-12)
    tokens = max(float(x.get("tokens", 0.0)), 1e-12)
    unique_tokens = max(float(x.get("unique_tokens", 0.0)), 1.0)

    return (
        k["L0"]
        + k["a"] * (params ** (-k["alpha"]))
        + k["b"] * (tokens ** (-k["beta"]))
        + k["c"] * math.log(unique_tokens)
    )


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts the final validation loss ('loss') for language model pre-training
    given parameter count ('params'), total training tokens ('tokens'), and the
    number of unique tokens in the dataset ('unique_tokens').

    Functional form (shared across groups):
        loss = L0 + a * params^(-alpha) + b * tokens^(-beta) + c * ln(unique_tokens)

    The coefficients (L0, a, alpha, b, beta, c) are group-specific. If the
    provided group is unknown, this function falls back to 'all_data'.

    Args:
        input_data: List of dicts; each should contain 'params', 'tokens',
                    and 'unique_tokens' (floats). Missing or non-positive
                    values are clamped to small positive defaults.
        group: Name of the experimental group.

    Returns:
        List of dicts with a single key 'loss' containing the prediction.
    """
    if not isinstance(input_data, list):
        raise TypeError("input_data must be a list of dictionaries")

    coeffs = _COEFFS.get(group, _COEFFS["all_data"])  # fallback to all_data

    out: List[Dict[str, float]] = []
    for row in input_data:
        y = _predict_single(row, coeffs)
        out.append({"loss": float(y)})
    return out

#2 Run 2 R² = 0.915943
#3 Run 3 R² = 0.915943
#4 Run 4 R² = 0.915943
#5 Run 5 R² = 0.914136
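
A minimal usage sketch for the Run 1 code above, assuming it is saved as scaling_law.py (a hypothetical filename); the parameter and token counts below are illustrative values, not points from the dataset.

Python
# Usage sketch: the module name and the numeric inputs are illustrative assumptions.
from scaling_law import law

rows = [
    # e.g. a 1.4B-parameter model trained on 26B tokens, all of them unique
    {"params": 1.4e9, "tokens": 26e9, "unique_tokens": 26e9},
    # the same model trained for four passes over the same 26B unique tokens
    {"params": 1.4e9, "tokens": 104e9, "unique_tokens": 26e9},
]

for row, pred in zip(rows, law(rows, group="all_data")):
    print(row, "->", round(pred["loss"], 4))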
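
The comments in Run 1 state that the coefficients were obtained via non-linear least squares. Below is a minimal sketch of how such a fit could be reproduced with scipy.optimize.curve_fit; the CSV path, column names, and initial guesses are assumptions, not details taken from the submission.

Python
# Illustrative refit of
#   loss = L0 + a * params^(-alpha) + b * tokens^(-beta) + c * ln(unique_tokens)
# The data path and column names below are assumptions.
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit


def model(X, L0, a, alpha, b, beta, c):
    params, tokens, unique_tokens = X
    return L0 + a * params ** (-alpha) + b * tokens ** (-beta) + c * np.log(unique_tokens)


df = pd.read_csv("data.csv")  # hypothetical file with params, tokens, unique_tokens, loss columns
X = (df["params"].values, df["tokens"].values, df["unique_tokens"].values)
y = df["loss"].values

# Rough starting point; exponents near 0.5 match the fitted alpha and beta above.
p0 = [3.0, 1e3, 0.5, 1e5, 0.5, -0.1]
popt, _ = curve_fit(model, X, y, p0=p0, maxfev=100000)
print(dict(zip(["L0", "a", "alpha", "b", "beta", "c"], popt)))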