SLD - Data-Constrained Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = 0.914608

▼

Python

from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Expected keys: 'params', 'tokens', 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s). Keys: 'loss'.
    """

    # Shared exponents across groups (fitted on the dataset in /app/data)
    alpha = 0.5036363636363637  # exponent for params
    beta = 0.5609090909090909   # exponent for tokens
    gamma = 0.1347272727272727  # exponent for unique_tokens

    # Per-group coefficients (intercept d, and positive coefficients a, b, c)
    # If an unknown group is provided, fall back to 'all_data'.
    group_coefs: Dict[str, Dict[str, float]] = {
        # Coefficients format: {"d": d, "a": a, "b": b, "c": c}
        "all_data": {
            "d": 1.89106612698,
            "a": 4951.85197888,
            "b": 103223.597751,
            "c": 15.153346927,
        },
    }

    coefs = group_coefs.get(group, group_coefs["all_data"])

    out: List[Dict[str, float]] = []
    for row in input_data:
        P = float(row.get("params", 0.0))
        T = float(row.get("tokens", 0.0))
        U = float(row.get("unique_tokens", 0.0))

        # Guard against non-positive inputs to avoid math domain issues.
        if P <= 0 or T <= 0 or U <= 0:
            # Degenerate fallback: return intercept if inputs invalid.
            yhat = float(coefs["d"])
        else:
            term_p = P ** (-alpha)
            term_t = T ** (-beta)
            term_u = U ** (-gamma)
            yhat = (
                float(coefs["d"]) +
                float(coefs["a"]) * term_p +
                float(coefs["b"]) * term_t +
                float(coefs["c"]) * term_u
            )

        out.append({"loss": float(yhat)})

    return out

#2 Run 2 R² = 0.914142

▼

Python

from __future__ import annotations

from typing import Dict, List


def _predict_loss(params: float, tokens: float, unique_tokens: float, coef: Dict[str, float]) -> float:
    """Additive power-law scaling with an irreducible loss floor.

    Formula:
        loss = L0 + a * params^(-alpha) + b * tokens^(-beta) + c * unique_tokens^(-gamma)

    All coefficients come from offline fitting per experimental group.
    """
    L0 = coef["L0"]
    a = coef["a"]
    alpha = coef["alpha"]
    b = coef["b"]
    beta = coef["beta"]
    c = coef["c"]
    gamma = coef["gamma"]

    return (
        L0
        + a * (params ** (-alpha))
        + b * (tokens ** (-beta))
        + c * (unique_tokens ** (-gamma))
    )


# Per-group fitted coefficients. If an unknown group is provided, fall back to
# the 'all_data' fit which was trained on the full dataset available.
_GROUP_COEFFICIENTS: Dict[str, Dict[str, float]] = {
    # Fitted on the provided dataset (/app/data), see explain.md for details.
    # Values rounded to 12 significant digits.
    "all_data": {
        "L0": 1.854238681380,
        "a": 5186.834853712,
        "alpha": 0.506558591032,
        "b": 108396.235322297,
        "beta": 0.563543703801,
        "c": 14.149776017335,
        "gamma": 0.129220485797,
    },
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    coefs = _GROUP_COEFFICIENTS.get(group) or _GROUP_COEFFICIENTS.get("all_data")
    if coefs is None:
        raise ValueError(f"No coefficients available for group '{group}' and no 'all_data' fallback.")

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        params = float(row["params"])  # model parameter count
        tokens = float(row["tokens"])  # total training tokens
        unique_tokens = float(row["unique_tokens"])  # unique tokens in dataset

        pred_loss = _predict_loss(params=params, tokens=tokens, unique_tokens=unique_tokens, coef=coefs)
        outputs.append({"loss": float(pred_loss)})

    return outputs

#3 Run 3 R² = 0.914136

▼

Python

from __future__ import annotations

from typing import Dict, List
import math


def _predict_loss(params: float, tokens: float, unique_tokens: float, coeffs: Dict[str, float]) -> float:
    """
    Core scaling-law prediction used across groups.

    Functional form (additive inverse power-laws):
        loss = L_inf \
               + A * params**(-alpha) \
               + B * tokens**(-beta) \
               + C * unique_tokens**(-gamma)

    All coefficients are positive and group-specific.
    """

    # Guard against non-positive inputs that would make the power undefined.
    p = max(float(params), 1.0)
    t = max(float(tokens), 1.0)
    u = max(float(unique_tokens), 1.0)

    L_inf = coeffs["L_inf"]
    A = coeffs["A"]
    alpha = coeffs["alpha"]
    B = coeffs["B"]
    beta = coeffs["beta"]
    C = coeffs["C"]
    gamma = coeffs["gamma"]

    return (
        L_inf
        + A * (p ** (-alpha))
        + B * (t ** (-beta))
        + C * (u ** (-gamma))
    )


# Learned coefficients per experimental group. If an unknown group is passed,
# we fall back to the most general fit ("all_data").
_GROUP_COEFFS: Dict[str, Dict[str, float]] = {
    # Fitted on the provided dataset (/app/data)
    # Using non-linear least squares (see explain.md for details).
    "all_data": {
        "L_inf": 1.8542436817280514,
        "A": 5185.963577534392,
        "alpha": 0.5065483600283685,
        "B": 108445.00803126824,
        "beta": 0.5635676173730012,
        "C": 14.149971201430411,
        "gamma": 0.12922069609065664,
    },
}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """

    coeffs = _GROUP_COEFFS.get(group, _GROUP_COEFFS["all_data"])  # fallback

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        params = float(row.get("params", 0.0))
        tokens = float(row.get("tokens", 0.0))
        unique_tokens = float(row.get("unique_tokens", 0.0))

        pred_loss = _predict_loss(params, tokens, unique_tokens, coeffs)
        outputs.append({"loss": float(pred_loss)})

    return outputs

#4 Run 4 R² = 0.806950

▼

Python

from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The functional form is a multiplicative power law with an asymptotic floor:

        loss = A + K * params**a * tokens**b * unique_tokens**c

    The exponents (a, b, c), the scale K, and the asymptote A are constant within
    an experimental group, but may differ across groups. If an unknown group is
    requested, this implementation falls back to the coefficients learned for
    'all_data'.

    Args:
        input_data: A list of dictionaries with keys: 'params', 'tokens', 'unique_tokens'.
        group: The experimental group name.

    Returns:
        A list of dictionaries with a single key 'loss' for each input point.
    """

    # Coefficients fitted on the provided dataset (group 'all_data').
    # Format: group -> (A, K, a, b, c)
    COEFFICIENTS: Dict[str, tuple[float, float, float, float, float]] = {
        # Derived from least-squares in log-space on /app/data
        # A = 2.554117, K = 605300.7661134443, a = -0.22335831735443584,
        # b = -0.15792362123370007, c = -0.2764050828072919
        "all_data": (2.554117, 605300.7661134443, -0.22335831735443584, -0.15792362123370007, -0.2764050828072919),
    }

    A, K, a, b, c = COEFFICIENTS.get(group, COEFFICIENTS["all_data"])

    out: List[Dict[str, float]] = []
    for row in input_data:
        p = float(row.get("params", 0.0))
        t = float(row.get("tokens", 0.0))
        u = float(row.get("unique_tokens", 0.0))

        # Basic guards to avoid invalid math; domain of the law expects positives.
        if p <= 0 or t <= 0 or u <= 0:
            # Fall back to the asymptote if inputs are invalid or missing.
            pred = float(A)
        else:
            pred = float(A + K * (p ** a) * (t ** b) * (u ** c))

        out.append({"loss": pred})

    return out

#5 Run 5 R² = 0.267461

▼

Python

from __future__ import annotations

from typing import Dict, List


def _predict_loss(params: float, tokens: float, unique_tokens: float, coeffs: Dict[str, float]) -> float:
    """
    Core scaling-law prediction used for every group.

    Functional form (data-constrained pretraining):
        E = min(tokens, unique_tokens)
        loss = c + a * params**(-alpha) + d * (E / params)**(-eta)

    This captures diminishing returns from model size (params) and from data,
    using the effective non-redundant tokens E which saturates at the number of
    unique tokens.
    """

    # Effective non-redundant tokens (saturates once we exhaust uniqueness)
    E = tokens if tokens <= unique_tokens else unique_tokens

    c = coeffs["c"]
    a = coeffs["a"]
    alpha = coeffs["alpha"]
    d = coeffs["d"]
    eta = coeffs["eta"]

    # Guard against pathological inputs
    p = max(params, 1.0)
    e = max(E, 1.0)

    return c + a * (p ** (-alpha)) + d * ((e / p) ** (-eta))


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Required keys: 'params', 'tokens', 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s) under key 'loss'.
    """

    # Coefficients per group (same functional form). If an unknown group is
    # provided, fall back to the most data-rich fit ('all_data').
    GROUP_COEFFS: Dict[str, Dict[str, float]] = {
        # Fitted on the provided dataset located at /app/data
        # Optimization method: non-linear least squares on the functional form
        # defined in _predict_loss (see /app/explain.md for details).
        "all_data": {
            "c": 2.39290393,
            "a": 1.40051550e04,
            "alpha": 0.509319873,
            "d": 0.371778024,
            "eta": 0.742509273,
        },
    }

    coeffs = GROUP_COEFFS.get(group, GROUP_COEFFS["all_data"])

    outputs: list[dict[str, float]] = []
    for row in input_data:
        params = float(row.get("params", 0.0))
        tokens = float(row.get("tokens", 0.0))
        unique_tokens = float(row.get("unique_tokens", 0.0))
        loss = _predict_loss(params, tokens, unique_tokens, coeffs)
        outputs.append({"loss": float(loss)})

    return outputs