SLD - Vocabulary Scaling Law - mini-swe-agent + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.976217

▼

Python

# Auto-generated scaling law implementation
# Functional form:
# unigram_normalized_loss = a
#   + bV * V^(-alpha) + bP * P^(-beta) + bN * N^(-gamma)
#   + bVP * V^(-alpha) * P^(-beta)
#   + bVN * V^(-alpha) * N^(-gamma)
#   + bPN * P^(-beta) * N^(-gamma)
# where V = vocab_size, P = non_vocab_parameters, N = num_characters.
# Coefficients and exponents are fitted per group; a global fallback is provided.

from typing import List, Dict
import math

# Fitted parameter sets keyed by group name. Each entry holds the three
# exponents (alpha, beta, gamma), the seven linear coefficients under "coef",
# and the "mse" / "count" values recorded alongside the fit.
_PARAMS = {
    "all_data": dict(
        alpha=0.25,
        beta=0.75,
        gamma=0.5,
        coef=dict(
            a=-5.662989544286598,
            bV=2.168436615813606,
            bP=302653.3106628973,
            bN=121210.75170193214,
            bVP=200457.1519513687,
            bVN=-385541.00382365123,
            bPN=-11998348015.944254,
        ),
        mse=0.0070885392771583644,
        count=1080,
    ),
}

# Fallback parameter set for unknown groups. It carries the same values as
# "all_data" but is built from fresh dict objects, so the two never alias.
_GLOBAL = {**_PARAMS["all_data"], "coef": dict(_PARAMS["all_data"]["coef"])}

def _predict_single(x: dict, pars: dict) -> float:
    """Evaluate the fitted law for a single data point.

    Args:
        x: Mapping with "vocab_size", "non_vocab_parameters" and
            "num_characters"; a missing key is read as 0.0.
        pars: Parameter set with exponents "alpha", "beta", "gamma" and a
            "coef" mapping holding a, bV, bP, bN, bVP, bVN and bPN.

    Returns:
        The predicted value, or NaN when any input is non-positive or
        non-finite.
    """
    vocab, params, chars = (
        float(x.get(key, 0.0))
        for key in ("vocab_size", "non_vocab_parameters", "num_characters")
    )
    for value in (vocab, params, chars):
        # `value > 0` is False for NaN as well, so NaN inputs are rejected here.
        if not (value > 0 and math.isfinite(value)):
            return float('nan')

    alpha, beta, gamma = pars["alpha"], pars["beta"], pars["gamma"]
    coef = pars["coef"]
    a, bV, bP, bN, bVP, bVN, bPN = (
        coef[name] for name in ("a", "bV", "bP", "bN", "bVP", "bVN", "bPN")
    )

    inv_vocab = vocab ** (-alpha)
    inv_params = params ** (-beta)
    inv_chars = chars ** (-gamma)

    # Main effects first, then the three pairwise interactions, added in the
    # same left-to-right order as the written formula.
    total = a + bV * inv_vocab + bP * inv_params + bN * inv_chars
    total = total + bVP * (inv_vocab * inv_params)
    total = total + bVN * (inv_vocab * inv_chars)
    total = total + bPN * (inv_params * inv_chars)
    return float(total)

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to every data point in ``input_data``.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values.
        group: Name of the experimental group. It is converted to a string
               and looked up in the per-group parameter table; unknown
               groups use the global fallback parameters.

    Returns:
        A list parallel to ``input_data`` in which each element is a
        dictionary holding the predicted "unigram_normalized_loss".
    """
    params = _PARAMS.get(str(group), _GLOBAL)
    return [
        {"unigram_normalized_loss": _predict_single(point, params)}
        for point in input_data
    ]

#2 Run 2 R² = 0.964859

▼

Python

from __future__ import annotations
from typing import List, Dict

# Discovered scaling law (common across groups):
# L = c0 + cN * N^(-a) + cD * D^(-b) + cV * V^(-g)
# where:
#   N = non_vocab_parameters
#   D = num_characters
#   V = vocab_size
# Exponents (a, b, g) are shared across all groups; coefficients vary per group.

# Power-law exponents applied to N, D and V respectively.
EXPONENTS = dict(a=1.0, b=0.3, g=0.1)

# Per-group coefficient lists, unpacked by the predictor as [c0, cN, cD, cV].
COEFFS_BY_GROUP = {
    "all_data": [
        -5.368667408369928,
        4673839.346438605,
        1502.3441237323534,
        -1.5489611402698065,
    ],
}

# Group whose coefficients are used when the requested group is unknown
_DEFAULT_GROUP = "all_data"

def _predict_single(x: Dict[str, float], coeffs, exps):
    """Evaluate c0 + cN*N^(-a) + cD*D^(-b) + cV*V^(-g) for one data point.

    Args:
        x: Mapping with "non_vocab_parameters" (N), "num_characters" (D) and
            "vocab_size" (V); a missing key is read as 0.0.
        coeffs: Exactly four values, unpacked as (c0, cN, cD, cV).
        exps: Mapping with the exponents under keys 'a', 'b' and 'g'.

    Returns:
        The value of the expression above.
    """
    raw_n = float(x.get('non_vocab_parameters', 0.0))
    raw_d = float(x.get('num_characters', 0.0))
    raw_v = float(x.get('vocab_size', 0.0))

    # Numerical safety: any input that is not strictly above the floor
    # (including NaN) is replaced by the floor before exponentiation.
    floor = 1e-12
    n_params = max(floor, raw_n)
    n_chars = max(floor, raw_d)
    n_vocab = max(floor, raw_v)

    exp_n, exp_d, exp_v = exps['a'], exps['b'], exps['g']
    intercept, coef_n, coef_d, coef_v = coeffs

    param_term = coef_n * (n_params ** (-exp_n))
    data_term = coef_d * (n_chars ** (-exp_d))
    vocab_term = coef_v * (n_vocab ** (-exp_v))
    return intercept + param_term + data_term + vocab_term

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the additive power-law model to every data point in ``input_data``.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values.
        group: Name of the experimental group whose coefficients are used.
               If the group is unknown, the default group's coefficients are
               used; failing that, the first entry in the coefficient table;
               failing that, all-zero coefficients.

    Returns:
        A list parallel to ``input_data`` in which each element is a
        dictionary holding the predicted 'unigram_normalized_loss' as a float.
    """
    coeffs = COEFFS_BY_GROUP.get(group)
    if coeffs is None:
        coeffs = COEFFS_BY_GROUP.get(_DEFAULT_GROUP)
    if coeffs is None:
        if COEFFS_BY_GROUP:
            # Default group is absent too: take whichever entry comes first.
            coeffs = next(iter(COEFFS_BY_GROUP.values()))
        else:
            # As a last resort: neutral coefficients
            coeffs = [0.0, 0.0, 0.0, 0.0]

    return [
        {'unigram_normalized_loss': float(_predict_single(point, coeffs, EXPONENTS))}
        for point in input_data
    ]

#3 Run 3 R² = 0.962742

▼

Python

# Auto-generated scaling law
# Functional form:
# L = c0_g + c1_g * V^(-1.0) + c2_g * V^(0.25) + c3_g * Pnv^(-0.5) + c4_g * N^(-0.25)
from __future__ import annotations
from typing import List, Dict

# Exponent magnitudes used by the four power terms of the functional form.
EXPONENTS = {"a1": 1.0, "a2": 0.25, "a3": 0.5, "a4": 0.25}

# Per-group coefficient lists, indexed as [c0, c1, c2, c3, c4].
COEFFS = {
    "all_data": [
        -6.673008180893682,
        260.8479120508126,
        0.02804094106290421,
        536.249428653781,
        598.7428891761693,
    ],
}

# Coefficients for unknown groups: an independent copy of the "all_data" fit.
DEFAULT_COEFFS = list(COEFFS["all_data"])

def _predict_one(d: dict, c: list[float]) -> float:
    """Evaluate the five-term law for one data point.

    Args:
        d: Mapping with "vocab_size", "non_vocab_parameters" and
            "num_characters"; a missing key is read as 0.0.
        c: Coefficients; indices 0 to 4 weight the constant, V^(-a1),
            V^(a2), Pnv^(-a3) and N^(-a4) terms respectively.

    Returns:
        The weighted sum of the five basis terms as a float.
    """
    # Safeguards: inputs that are not strictly above the floor (including
    # NaN) are replaced by the floor before any power is taken.
    floor = 1e-12
    vocab = max(floor, float(d.get("vocab_size", 0.0)))
    non_vocab = max(floor, float(d.get("non_vocab_parameters", 0.0)))
    chars = max(floor, float(d.get("num_characters", 0.0)))

    a1, a2, a3, a4 = (EXPONENTS[name] for name in ("a1", "a2", "a3", "a4"))
    basis = (
        1.0,
        vocab ** (-a1),
        vocab ** (a2),
        non_vocab ** (-a3),
        chars ** (-a4),
    )

    # Accumulate left to right so the floating-point result matches the
    # written-out sum c[0]*t0 + c[1]*t1 + ... + c[4]*t4.
    total = c[0] * basis[0]
    for idx in range(1, 5):
        total = total + c[idx] * basis[idx]
    return float(total)

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Apply the fitted law to every data point in ``input_data``.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values.
        group: Name of the experimental group whose coefficients are used.
               Unknown groups use the default coefficients.

    Returns:
        A list parallel to ``input_data`` in which each element is a
        dictionary holding the predicted "unigram_normalized_loss".
    """
    coeffs = COEFFS.get(group, DEFAULT_COEFFS)
    return [
        {"unigram_normalized_loss": _predict_one(point, coeffs)}
        for point in input_data
    ]

#4 Run 4 R² = 0.867500

▼

Python

# Auto-generated scaling law based on dataset at /app/data
# Functional form (same for all groups):
#   y = a0 + a1*log10(V) + a2*log10(P) + a3*log10(C) + a4*(log10(V))^2 + a5*log10(V)*log10(P) + a6*log10(V)*log10(C)
# where:
#   V = vocab_size
#   P = non_vocab_parameters
#   C = num_characters
# Coefficients (a0..a6) differ by group; fallback to global if group unknown.

import math

# Coefficients a0..a6 keyed by group name.
_COEF_BY_GROUP = {
    "all_data": [
        1.943622217604,
        0.63942541583,
        -0.246590685233,
        -0.250427241353,
        0.131718659734,
        0.067329156434,
        -0.214521749053,
    ],
}
# Entry used for unknown groups: an independent copy of the "all_data" fit.
_COEF_BY_GROUP["_global_fallback_"] = list(_COEF_BY_GROUP["all_data"])

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Apply the log10-quadratic model to every data point in ``input_data``.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values. Missing inputs are read
                    as 0.0 and then raised to a small positive floor.
        group: Name of the experimental group. It is converted to a string
               and looked up in the coefficient table; unknown groups use
               the "_global_fallback_" entry.

    Returns:
        A list parallel to ``input_data`` in which each element is a
        dictionary holding the predicted "unigram_normalized_loss" as a float.
    """
    weights = _COEF_BY_GROUP.get(str(group))
    if weights is None:
        weights = _COEF_BY_GROUP.get("_global_fallback_")

    floor = 1e-12

    def clamped_log10(point, key):
        # Values not strictly above the floor (including NaN) become the
        # floor, so log10 always receives a positive argument.
        return math.log10(max(floor, float(point.get(key, 0.0))))

    def predict(point):
        log_v = clamped_log10(point, "vocab_size")
        log_p = clamped_log10(point, "non_vocab_parameters")
        log_c = clamped_log10(point, "num_characters")
        basis = (1.0, log_v, log_p, log_c, log_v * log_v, log_v * log_p, log_v * log_c)
        # Plain running sum from 0.0 keeps the floating-point result
        # identical to a term-by-term accumulation.
        total = 0.0
        for weight, feature in zip(weights, basis):
            total += weight * feature
        return float(total)

    return [{"unigram_normalized_loss": predict(point)} for point in input_data]

#5 Run 5 R² = -1.000000

▼

Python

from typing import List, Dict

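# This module is auto-generated to implement a scaling law:
#   L_hat = A_g * non_vocab_parameters^(b1_g) * num_characters^(b2_g) * vocab_size^(b3_g)
# Using a single functional form across groups, with group-specific coefficients.
# Each group entry stores the exponents under "b1", "b2", "b3" and the prefactor
# under "A"; "b0" is only used (as A = exp(b0)) when "A" is absent.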
# Per-group coefficient table. Both entries carry the same values but are
# separate dict objects.
_COEFFS = {
    name: {"b0": 0.0, "b1": 0.0, "b2": 0.0, "b3": 0.0, "A": 1.0}
    for name in ("all_data", "ALL")
}

def _get_group_key(group: str) -> str:
    """Return ``group`` if it has an entry in ``_COEFFS``, otherwise "ALL"."""
    return group if group in _COEFFS else "ALL"

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The prediction for each data point is
    ``A * non_vocab_parameters**b1 * num_characters**b2 * vocab_size**b3``.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Missing inputs are read as 0.0 and
                    values below 1e-12 are raised to 1e-12.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.
                Unknown groups use the "ALL" entry.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    import math

    key = _get_group_key(group)
    pars = _COEFFS.get(key, _COEFFS.get("ALL"))
    if pars is None:
        # Ultimate fallback if nothing available
        pars = {"b0": 0.0, "b1": 0.0, "b2": 0.0, "b3": 0.0, "A": 1.0}

    # Derive A from b0 only when "A" is absent. Passing exp(b0) as the default
    # of dict.get() would evaluate it unconditionally, so a large or invalid
    # b0 could raise even though an explicit "A" makes it irrelevant.
    if "A" in pars:
        A = float(pars["A"])
    else:
        A = math.exp(float(pars.get("b0", 0.0)))
    b1 = float(pars.get("b1", 0.0))  # exponent for non_vocab_parameters
    b2 = float(pars.get("b2", 0.0))  # exponent for num_characters
    b3 = float(pars.get("b3", 0.0))  # exponent for vocab_size

    # Guard against non-positive values (loop-invariant, so defined once)
    eps = 1e-12

    out: List[Dict[str, float]] = []
    for row in input_data:
        Np = float(row.get("non_vocab_parameters", 0.0))
        D = float(row.get("num_characters", 0.0))
        V = float(row.get("vocab_size", 0.0))
        Np = max(Np, eps)
        D = max(D, eps)
        V = max(V, eps)
        y = A * (Np ** b1) * (D ** b2) * (V ** b3)
        out.append({"unigram_normalized_loss": float(y)})
    return out