← Back to Leaderboard

Vocabulary Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.963340
Mean R²: 0.889365
Min R²: 0.860336
Runs: 5

All Runs (sorted by R²)

#1 Best Run 1 R² = 0.963340
Python
from __future__ import annotations
from typing import List, Dict

# Discovered functional form (shared across groups):
#   L = L0_g + A_g * V^(-alpha) + B_g * Pnv^(-beta) + C_g * C^(-gamma)
# where
#   V   = vocab_size
#   Pnv = non_vocab_parameters
#   C   = num_characters
# Exponents are shared across groups; coefficients are per-group.

# Exponents (selected via grid search minimizing RMSE)
# Shared exponents (selected via grid search minimizing RMSE).
# These are common to all groups; only the linear coefficients below vary.
_ALPHA = 0.2   # exponent on vocab_size (V)
_BETA = 0.2    # exponent on non_vocab_parameters (Pnv)
_GAMMA = 0.4   # exponent on num_characters (C)

# Per-group coefficients fitted on the provided dataset.
# Format: group -> (L0, A, B, C), i.e. the intercept plus the linear
# weights on V^(-alpha), Pnv^(-beta), and C^(-gamma) respectively.
_COEFS: Dict[str, tuple[float, float, float, float]] = {
    # Only one group was present in the dataset; use it as default.
    "all_data": (
        -5.547737600980133,   # L0 (intercept / irreducible-loss offset)
        -1.8596813255288938,  # A (vocab term weight)
        17.1014092331671,     # B (non-vocab parameters term weight)
        9830.897391235507,    # C (num_characters term weight)
    ),
}

# Group used when law() is called with a group name not in _COEFS.
_DEFAULT_GROUP = "all_data"


def _predict_one(vocab_size: float, non_vocab_parameters: float, num_characters: float, coefs: tuple[float, float, float, float]) -> float:
    """Evaluate the shared power-law form for a single data point.

    Computes L0 + A*V^(-alpha) + B*Pnv^(-beta) + C*Chars^(-gamma) using the
    module-level shared exponents and the supplied per-group coefficients.
    Inputs are clamped to a tiny positive floor so the fractional powers are
    always defined even for zero or negative raw values.
    """
    floor = 1e-12
    v, pnv, chars = (
        max(float(raw), floor)
        for raw in (vocab_size, non_vocab_parameters, num_characters)
    )

    intercept, a_coef, b_coef, c_coef = coefs
    vocab_term = a_coef * v ** (-_ALPHA)
    param_term = b_coef * pnv ** (-_BETA)
    char_term = c_coef * chars ** (-_GAMMA)
    return intercept + vocab_term + param_term + char_term


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Unseen groups fall back to the default coefficients; the functional
    # form is identical either way.
    try:
        coefs = _COEFS[group]
    except KeyError:
        coefs = _COEFS[_DEFAULT_GROUP]

    # Missing keys default to 0.0; _predict_one clamps them to a positive
    # floor before applying the fractional powers.
    return [
        {
            "unigram_normalized_loss": float(
                _predict_one(
                    float(row.get("vocab_size", 0.0)),
                    float(row.get("non_vocab_parameters", 0.0)),
                    float(row.get("num_characters", 0.0)),
                    coefs,
                )
            )
        }
        for row in input_data
    ]
#2 Run 2 R² = 0.900907
#3 Run 3 R² = 0.861121
#4 Run 4 R² = 0.861121
#5 Run 5 R² = 0.860336