← Back to Leaderboard

Vocabulary Scaling Law

Agent: openhands
Model: GPT-5
Best R²: 0.980329
Mean R²: 0.970202
Min R²: 0.960017
Runs: 3

All Runs (sorted by R²)

Best Run 1 R² = 0.980329
Python
from __future__ import annotations
import math
from typing import Dict, List

# Quadratic-in-logs scaling law coefficients per experimental group.
# Fitted on the provided training dataset.
# Inner keys are feature names (they match the keys emitted by _features());
# inner values are the fitted regression weights. "all_data" also serves as
# the fallback group used by law() when an unknown group name is passed.
_COEFS: Dict[str, Dict[str, float]] = {
    "all_data": {
        "const": 43.653023403132735,
        "lnV": 0.7794957511938669,
        "lnP": 0.5846007123502984,
        "lnC": -4.504394566402747,
        "lnV2": 0.028553981965242906,
        "lnP2": 0.025813565754701645,
        "lnC2": 0.13736040362700275,
        "lnV_lnP": 0.02259283815603192,
        "lnV_lnC": -0.07386461582128809,
        "lnP_lnC": -0.08135643672419962,
    }
}

# Canonical feature ordering used by _predict_one() when accumulating the
# dot product; every name here must exist in the dict built by _features().
_FEATURES = (
    "const",
    "lnV",
    "lnP",
    "lnC",
    "lnV2",
    "lnP2",
    "lnC2",
    "lnV_lnP",
    "lnV_lnC",
    "lnP_lnC",
)


def _features(example: Dict[str, float]) -> Dict[str, float]:
    v = float(example["vocab_size"])  # V
    p = float(example["non_vocab_parameters"])  # P
    c = float(example["num_characters"])  # C

    # Natural logs; guard against non-positive with tiny epsilon
    eps = 1e-12
    lnV = math.log(v if v > 0 else eps)
    lnP = math.log(p if p > 0 else eps)
    lnC = math.log(c if c > 0 else eps)

    return {
        "const": 1.0,
        "lnV": lnV,
        "lnP": lnP,
        "lnC": lnC,
        "lnV2": lnV * lnV,
        "lnP2": lnP * lnP,
        "lnC2": lnC * lnC,
        "lnV_lnP": lnV * lnP,
        "lnV_lnC": lnV * lnC,
        "lnP_lnC": lnP * lnC,
    }


def _predict_one(ex: Dict[str, float], coefs: Dict[str, float]) -> float:
    """Dot product of the feature vector of *ex* with the group coefficients.

    Features are accumulated in the canonical _FEATURES order; a feature
    missing from *coefs* contributes zero.
    """
    feature_values = _features(ex)
    return sum(coefs.get(name, 0.0) * feature_values[name] for name in _FEATURES)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Fall back to the "all_data" coefficients for unknown group names.
    group_coefs = _COEFS[group] if group in _COEFS else _COEFS.get("all_data", {})
    if not group_coefs:
        raise ValueError(f"No coefficients available for group '{group}' and no default group present.")

    return [
        {"unigram_normalized_loss": float(_predict_one(point, group_coefs))}
        for point in input_data
    ]
#2 Run 2 R² = 0.970260
#3 Run 3 R² = 0.960017