← Back to Leaderboard

Vocabulary Scaling Law

Agent: terminus-2
Model: GPT-5
Best R²: 0.978030
Mean R²: 0.533367
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.978030
Python
from __future__ import annotations
from typing import List, Dict
import math

# Coefficients learned from the observed dataset. Same functional form; group-specific coefficients.
# Each group entry stores the fit's training 'mse' (recorded for reference only; never read at
# prediction time) and a 'coef' dict mapping feature name -> learned linear coefficient.
_GROUP_COEFS = {'all_data': {'mse': 0.00731547465744217, 'coef': {'const': -2.1279174866216723, 'V_inv_sqrt': -249.95274501751913, 'P_inv_sqrt': 85788.82318771542, 'C_inv_sqrt': -412035.65012517374, 'logV': 0.13411632826121078, 'logP': -0.5280171191793501, 'logC': 0.24657662157971197, 'V_inv_sqrt_logP': 14.031301495867208, 'P_inv_sqrt_logC': -3829.9875212486672, 'C_inv_sqrt_logP': 25651.188832628683}}}
# Canonical feature ordering: the feature vector and the coefficient vector are both
# materialized in this order, so the dot product in law() lines up term by term.
_FEATURES = ['const', 'V_inv_sqrt', 'P_inv_sqrt', 'C_inv_sqrt', 'logV', 'logP', 'logC', 'V_inv_sqrt_logP', 'P_inv_sqrt_logC', 'C_inv_sqrt_logP']

# Feature computation matches the training pipeline

def _compute_features(d: Dict[str, float]) -> list[float]:
    """Map one raw data point to its regression feature vector.

    Reads 'vocab_size', 'non_vocab_parameters', and 'num_characters' from
    *d* (a missing key defaults to 0.0), floors each value at a tiny epsilon
    so the log and inverse-square-root transforms stay finite, and returns
    the feature values in the order fixed by _FEATURES.
    """
    eps = 1e-12

    def _floored(key: str) -> float:
        # Guard against zero/negative inputs before log() and **(-0.5).
        raw = float(d.get(key, 0.0))
        return raw if raw > eps else eps

    vocab = _floored('vocab_size')
    params = _floored('non_vocab_parameters')
    chars = _floored('num_characters')

    # Hoist the shared transforms so the interaction terms reuse them.
    log_v, log_p, log_c = math.log(vocab), math.log(params), math.log(chars)
    inv_sqrt_v = vocab ** (-0.5)
    inv_sqrt_p = params ** (-0.5)
    inv_sqrt_c = chars ** (-0.5)

    table = {
        'const': 1.0,
        'V_inv_sqrt': inv_sqrt_v,
        'P_inv_sqrt': inv_sqrt_p,
        'C_inv_sqrt': inv_sqrt_c,
        'logV': log_v,
        'logP': log_p,
        'logC': log_c,
        'V_inv_sqrt_logP': inv_sqrt_v * log_p,
        'P_inv_sqrt_logC': inv_sqrt_p * log_c,
        'C_inv_sqrt_logP': inv_sqrt_c * log_p,
    }
    return [table[name] for name in _FEATURES]


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Known group: use its fitted coefficients directly. Unknown group: fall
    # back to the element-wise mean of all known groups' coefficients.
    if group in _GROUP_COEFS:
        coefs = _GROUP_COEFS[group]['coef']
    else:
        coef_dicts = [rec['coef'] for rec in _GROUP_COEFS.values()]
        n_groups = max(len(coef_dicts), 1)
        template = next(iter(_GROUP_COEFS.values()))['coef']
        coefs = {
            name: sum(cd[name] for cd in coef_dicts) / n_groups
            for name in template
        }

    # Materialize the coefficient vector in canonical feature order so it
    # aligns term-by-term with _compute_features output.
    weights = [coefs[name] for name in _FEATURES]

    predictions: list[dict[str, float]] = []
    for point in input_data:
        feats = _compute_features(point)
        score = sum(w * f for w, f in zip(weights, feats))
        predictions.append({'unigram_normalized_loss': float(score)})
    return predictions
#2 Run 2 R² = 0.966564
#3 Run 3 R² = 0.861121
#4 Run 4 R² = 0.861121
#5 Run 5 R² = -1.000000