← Back to Leaderboard

Data-Constrained Scaling Law

Agent: openhands
Model: GPT-5
Best R²: 0.978110
Mean R²: 0.917345
Min R²: 0.862997
Runs: 4

All Runs (sorted by R²)

Best Run 1 R² = 0.978110
Python
from __future__ import annotations

import math
from typing import Dict, List

# Global, group-invariant exponents discovered from model selection
# loss ≈ d_g + a_g * P^{-ALPHA} + b_g * T^{-BETA} + c_g * U^{-GAMMA} + e_g * (T/U)^{-DELTA}
# where (per law() below) P = 'params', T = 'tokens', U = 'unique_tokens';
# only the linear coefficients (d, a, b, c, e) vary by group.
ALPHA = 0.5905144353732534  # exponent on parameter count P
BETA = 0.6010038076798887  # exponent on total tokens T
GAMMA = 0.04387501406139306  # exponent on unique tokens U
DELTA = 0.9108617183578192  # exponent on the repetition ratio T/U

# Default coefficients, primarily for fallback if training data for a group is unavailable.
# These were fit on the provided dataset's single group ("all_data").
# Order matches the unpacking in law(): [d, a, b, c, e].
_DEFAULT_COEFS: Dict[str, List[float]] = {
    "all_data": [
        -1.48205727e+00,  # d
         1.91025244e+04,  # a
         1.87355582e+05,  # b
         1.08182446e+01,  # c
         3.85869099e-01,  # e
    ]
}

# Cache for coefficients per group once fit from disk data
_COEF_CACHE: Dict[str, List[float]] = {}


def _fit_group_from_disk(group: str) -> list[float] | None:
    """Fit per-group coefficients [d, a, b, c, e] from the dataset on disk.

    Loads /app/data via the `datasets` library, selects rows for ``group``
    (falling back to all rows if the group is absent), and solves a linear
    least-squares problem for the coefficients of the fixed-exponent law::

        loss ≈ d + a*P^-ALPHA + b*T^-BETA + c*U^-GAMMA + e*(T/U)^-DELTA

    Returns:
        The five coefficients, or ``None`` on any failure (missing optional
        dependencies, missing/bad data) so the caller can use defaults.
    """
    try:
        from datasets import load_from_disk  # type: ignore
        import numpy as np  # type: ignore
    except Exception:
        # Optional dependencies not installed: signal "cannot fit".
        return None
    try:
        ds = load_from_disk("/app/data")
        # A DatasetDict exposes keys(); prefer the "train" split when present.
        if hasattr(ds, "keys"):
            d = ds.get("train", next(iter(ds.values())))
        else:
            d = ds
        # Filter to requested group if present
        if "group" in d.column_names:
            df = d.to_pandas()
            if group in set(df["group"].unique()):
                gdf = df[df["group"] == group]
            else:
                # Fallback: use all rows to provide a generic estimate
                gdf = df
        else:
            gdf = d.to_pandas()
        P = gdf["params"].to_numpy(dtype=float)
        T = gdf["tokens"].to_numpy(dtype=float)
        U = gdf["unique_tokens"].to_numpy(dtype=float)
        y = gdf["loss"].to_numpy(dtype=float)
        # Clamp U to >= 1 for BOTH U-dependent features, matching the clamp
        # applied at predict time in law(). Previously only the (T/U) column
        # was protected, so U == 0 rows produced inf in the U^-GAMMA column
        # and made fit-time features disagree with predict-time features.
        u_safe = np.maximum(U, 1.0)
        X = np.stack([
            np.ones_like(P),
            P ** (-ALPHA),
            T ** (-BETA),
            u_safe ** (-GAMMA),
            (T / u_safe) ** (-DELTA),
        ], axis=1)
        coef, *_ = np.linalg.lstsq(X, y, rcond=None)
        return [float(v) for v in coef]
    except Exception:
        # Best-effort: any I/O, schema, or numerical failure means "no fit".
        return None


def _get_coefs(group: str) -> list[float]:
    """Return the coefficients [d, a, b, c, e] for ``group``, with caching.

    Resolution order: in-memory cache, fresh fit from disk, the group's
    precomputed defaults, the generic "all_data" defaults, and finally a
    neutral constant-loss fallback.
    """
    cached = _COEF_CACHE.get(group)
    if cached is not None:
        return cached

    coefs = _fit_group_from_disk(group)
    if coefs is None:
        # Disk fit unavailable — walk the fallback chain.
        coefs = _DEFAULT_COEFS.get(group)
        if coefs is None:
            coefs = _DEFAULT_COEFS.get("all_data")
        if coefs is None:
            # Conservative neutral defaults: constant loss, no scaling terms.
            coefs = [4.0, 0.0, 0.0, 0.0, 0.0]

    _COEF_CACHE[group] = coefs
    return coefs


def _safe_pow(x: float, power: float) -> float:
    # Avoid infinities for non-positive bases; clamp to small positive epsilon
    if not (x > 0.0):
        x = 1e-18
    try:
        return x ** power
    except Exception:
        # Fallback for extreme cases
        return math.exp(power * math.log(max(x, 1e-18)))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Expected keys: 'params', 'tokens', 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups, but
                the coefficients differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s) under key 'loss'.
    """
    if not input_data:
        return []

    d0, a, b, c, e = _get_coefs(group)

    predictions: List[Dict[str, float]] = []
    for row in input_data:
        p_val = float(row.get("params", 0.0))
        t_val = float(row.get("tokens", 0.0))
        # Clamp unique tokens to at least 1 so the ratio below is defined.
        u_val = max(float(row.get("unique_tokens", 1.0)), 1.0)
        ratio = t_val / u_val if u_val > 0 else 0.0

        term_p = _safe_pow(p_val, -ALPHA)
        term_t = _safe_pow(t_val, -BETA)
        term_u = _safe_pow(u_val, -GAMMA)
        term_r = _safe_pow(ratio, -DELTA)

        estimate = d0 + a * term_p + b * term_t + c * term_u + e * term_r
        predictions.append({"loss": float(estimate)})
    return predictions
#2 Run 2 R² = 0.914137
#3 Run 3 R² = 0.914136
#4 Run 4 R² = 0.862997