from __future__ import annotations
import math
from typing import List, Dict
# Names of the regression features, in the exact order the fitted coefficient
# vectors below expect (coefs[i] multiplies the value of FEATURES[i]).
FEATURES = ['log_params', 'log_tokens', 'log_unique_tokens', 'log_params:log_tokens', 'tokens_inv_sqrt']

# Per-experimental-group fitted parameters: an intercept plus one coefficient
# per entry of FEATURES.
GROUP_PARAMS = {
    "all_data": {
        "intercept": 21.017514457355812,
        "coefs": [
            -0.8745591289420206,
            -0.5044278099541184,
            -0.11831988273483411,
            0.029264245542033336,
            38424.77315308764,
        ],
    },
}

# Fallback parameters used when an unknown group is requested. The original
# code duplicated the "all_data" numbers verbatim; aliasing the same dict
# keeps the fallback and the group fit from ever drifting apart.
# NOTE: shared reference — callers must treat these parameters as read-only.
GLOBAL_PARAMS = GROUP_PARAMS["all_data"]
def _feature_vector(params: float, tokens: float, unique_tokens: float):
    """Build the regression feature row for a single data point.

    Values are emitted in the order given by the module-level ``FEATURES``
    list; any feature name not recognized here contributes 0.0.

    Args:
        params: Model parameter count (must be positive; passed to log).
        tokens: Training token count (must be positive; passed to log/sqrt).
        unique_tokens: Unique token count (must be positive; passed to log).

    Returns:
        A list of floats, one per entry of ``FEATURES``.
    """
    log_p = math.log(params)
    log_t = math.log(tokens)
    log_u = math.log(unique_tokens)
    # Name -> value table replaces the original if/elif chain; iterating
    # FEATURES preserves the required ordering, and .get(..., 0.0) preserves
    # the original's zero fallback for unknown names.
    feature_values = {
        "log_params": log_p,
        "log_tokens": log_t,
        "log_unique_tokens": log_u,
        "log_params:log_tokens": log_p * log_t,
        "tokens_inv_sqrt": 1.0 / math.sqrt(tokens),
    }
    return [feature_values.get(name, 0.0) for name in FEATURES]
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """Predict the output variable for each input point under the fitted law.

    The functional form (intercept + linear combination of the engineered
    features) is shared across groups; only the fitted constants may differ
    per group. Unknown groups fall back to the global fit.

    Args:
        input_data: One dict per data point, with keys "params", "tokens",
            and "unique_tokens" mapping to numeric values.
        group: Experimental group whose fitted constants should be used.

    Returns:
        One dict per input point, each containing the predicted "loss".
    """
    # Unknown group names silently fall back to the globally-fitted constants.
    fit = GROUP_PARAMS.get(group, GLOBAL_PARAMS)
    b0 = fit["intercept"]
    betas = fit["coefs"]
    predictions: list[dict[str, float]] = []
    for point in input_data:
        features = _feature_vector(
            float(point["params"]),
            float(point["tokens"]),
            float(point["unique_tokens"]),
        )
        # Linear model: intercept + dot(betas, features).
        loss = b0 + sum(b * f for b, f in zip(betas, features))
        predictions.append({"loss": float(loss)})
    return predictions