SLD - Vocabulary Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = 0.904636

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Lazy-fit coefficients from /app/data on first call, cache for reuse.
    # The discovered functional form is:
    #   L = L_inf + K * N^a * D^b * V^c
    # where:
    #   L = unigram_normalized_loss
    #   N = non_vocab_parameters
    #   D = num_characters
    #   V = vocab_size
    # and (L_inf, K, a, b, c) depend on the group but the form is shared.
    import math

    # A minimal, sane default in case fitting can't run (e.g., datasets/numpy missing).
    DEFAULT_PARAMS = {"L_inf": 0.6, "K": 0.4, "a": -0.1, "b": -0.1, "c": -0.1}

    # Initialize caches on the function object
    if not hasattr(law, "_params_by_group"):
        law._params_by_group = {}  # type: ignore[attr-defined]
    if not hasattr(law, "_fitted"):
        law._fitted = False  # type: ignore[attr-defined]

    def _safe_float(x, default=1.0):
        try:
            return float(x)
        except Exception:
            return float(default)

    def _predict_with_params(params, rows):
        L_inf = params["L_inf"]
        K = params["K"]
        a = params["a"]
        b = params["b"]
        c = params["c"]
        preds = []
        for row in rows:
            V = max(_safe_float(row.get("vocab_size", 0.0)), 1e-12)
            N = max(_safe_float(row.get("non_vocab_parameters", 0.0)), 1e-12)
            D = max(_safe_float(row.get("num_characters", 0.0)), 1e-12)
            pred = L_inf + K * (N ** a) * (D ** b) * (V ** c)
            preds.append({"unigram_normalized_loss": float(pred)})
        return preds

    def _write_explain_md(params_by_group):
        # Best-effort write; ignore any filesystem errors.
        try:
            lines = []
            lines.append("# Scaling law for unigram-normalized loss")
            lines.append("")
            lines.append("We model the unigram-normalized loss (L) as a sum of an irreducible floor and a separable power-law over compute, data, and vocabulary:")
            lines.append("")
            lines.append("L = L_inf + K * N^a * D^b * V^c")
            lines.append("")
            lines.append("where:")
            lines.append("- L: unigram_normalized_loss")
            lines.append("- N: non_vocabulary parameters (non_vocab_parameters)")
            lines.append("- D: total training characters (num_characters)")
            lines.append("- V: vocabulary size (vocab_size)")
            lines.append("")
            lines.append("Methodology summary:")
            lines.append("- For each group, we choose L_inf via a grid search below the minimum observed loss.")
            lines.append("- Given a candidate L_inf, we fit ln(L - L_inf) = ln K + a ln N + b ln D + c ln V via least squares.")
            lines.append("- We select the L_inf that minimizes the squared residuals in log-space.")
            lines.append("")
            lines.append("## Fitted parameters by group")
            lines.append("")
            if not params_by_group:
                lines.append("_No dataset found during fitting; defaults in use._")
            else:
                # Show GLOBAL first if present
                ordered = []
                if "GLOBAL" in params_by_group:
                    ordered.append(("GLOBAL", params_by_group["GLOBAL"]))
                ordered.extend([(g, p) for g, p in params_by_group.items() if g != "GLOBAL"])
                for g, p in ordered:
                    lines.append(f"### {g}")
                    lines.append(f"- L_inf: {p['L_inf']:.6g}")
                    lines.append(f"- K: {p['K']:.6g}")
                    lines.append(f"- a (non_vocab_parameters exponent): {p['a']:.6g}")
                    lines.append(f"- b (num_characters exponent): {p['b']:.6g}")
                    lines.append(f"- c (vocab_size exponent): {p['c']:.6g}")
                    lines.append("")
            with open("/app/explain.md", "w", encoding="utf-8") as f:
                f.write("\n".join(lines) + "\n")
        except Exception:
            pass

    def _fit_if_needed():
        if law._fitted:  # type: ignore[attr-defined]
            return
        # Attempt to fit from /app/data
        params_by_group = {}

        # Small helper to set defaults when fit fails
        def _set_defaults(groups):
            if not groups:
                params_by_group["GLOBAL"] = DEFAULT_PARAMS.copy()
            for g in groups:
                params_by_group[g] = DEFAULT_PARAMS.copy()

        try:
            # Import locally to keep the file limited to a single public function.
            try:
                import numpy as np  # type: ignore
            except Exception:
                # Can't fit without numpy
                _set_defaults(groups=[])
                law._params_by_group = params_by_group  # type: ignore[attr-defined]
                law._fitted = True  # type: ignore[attr-defined]
                _write_explain_md(params_by_group)
                return

            try:
                from datasets import load_from_disk  # type: ignore
            except Exception:
                # Can't load dataset, fall back to defaults
                _set_defaults(groups=[])
                law._params_by_group = params_by_group  # type: ignore[attr-defined]
                law._fitted = True  # type: ignore[attr-defined]
                _write_explain_md(params_by_group)
                return

            ds = load_from_disk("/app/data")

            # Flatten to a list of Python dicts
            rows = []
            try:
                # Dataset or DatasetDict
                if hasattr(ds, "keys") and callable(ds.keys):
                    for k in ds.keys():
                        split = ds[k]
                        for r in split:
                            rows.append(dict(r))
                else:
                    for r in ds:
                        rows.append(dict(r))
            except Exception:
                # As a fallback, try to access .to_list()
                try:
                    rows = list(ds.to_list())
                except Exception:
                    rows = []

            # Identify group column
            group_col = None
            if rows:
                candidate_cols = ["group", "Group", "group_name", "experiment_group", "family"]
                sample_keys = rows[0].keys()
                for c in candidate_cols:
                    if c in sample_keys:
                        group_col = c
                        break

            if not rows:
                _set_defaults(groups=[])
                law._params_by_group = params_by_group  # type: ignore[attr-defined]
                law._fitted = True  # type: ignore[attr-defined]
                _write_explain_md(params_by_group)
                return

            # Build groups
            if group_col is None:
                groups = {"GLOBAL": rows}
            else:
                groups = {}
                for r in rows:
                    g = r.get(group_col, "GLOBAL")
                    if g is None:
                        g = "GLOBAL"
                    g = str(g)
                    groups.setdefault(g, []).append(r)

            # Always include GLOBAL as an aggregate fit across all
            if group_col is not None:
                groups["GLOBAL"] = rows

            # Fit each group
            for gname, grows in groups.items():
                # Extract and validate
                N_list = []
                D_list = []
                V_list = []
                Y_list = []
                for r in grows:
                    try:
                        V = float(r.get("vocab_size", float("nan")))
                        N = float(r.get("non_vocab_parameters", float("nan")))
                        D = float(r.get("num_characters", float("nan")))
                        Y = float(r.get("unigram_normalized_loss", float("nan")))
                    except Exception:
                        continue
                    if not (V > 0 and N > 0 and D > 0 and math.isfinite(V) and math.isfinite(N) and math.isfinite(D)):
                        continue
                    if not (math.isfinite(Y)):
                        continue
                    N_list.append(N)
                    D_list.append(D)
                    V_list.append(V)
                    Y_list.append(Y)

                if len(Y_list) < 8:
                    # Not enough data to fit robustly
                    params_by_group[gname] = DEFAULT_PARAMS.copy()
                    continue

                N_arr = np.array(N_list, dtype=np.float64)
                D_arr = np.array(D_list, dtype=np.float64)
                V_arr = np.array(V_list, dtype=np.float64)
                Y_arr = np.array(Y_list, dtype=np.float64)

                # Grid search for L_inf below min(Y)
                y_min = float(np.min(Y_arr))
                y_max = float(np.max(Y_arr))
                y_range = max(y_max - y_min, 1e-6)
                L_low = y_min - 0.5 * y_range
                L_high = y_min - 1e-8
                # Ensure strictly less than min(Y)
                if L_low >= L_high:
                    L_low = y_min - 0.5 * max(y_range, 1.0)
                    L_high = y_min - 1e-8

                L_candidates = np.linspace(L_low, L_high, num=80, dtype=np.float64)

                lnN = np.log(N_arr)
                lnD = np.log(D_arr)
                lnV = np.log(V_arr)
                X = np.column_stack([lnN, lnD, lnV, np.ones_like(lnN)])

                best_sse = float("inf")
                best = None

                for L_inf in L_candidates:
                    Z = Y_arr - L_inf
                    if np.any(Z <= 0):
                        continue
                    lnZ = np.log(Z)
                    # Solve for theta: [a, b, c, lnK]
                    try:
                        theta, _, _, _ = np.linalg.lstsq(X, lnZ, rcond=None)
                    except Exception:
                        continue
                    residuals = lnZ - X.dot(theta)
                    sse = float(np.dot(residuals, residuals))
                    if sse < best_sse and np.isfinite(sse):
                        best_sse = sse
                        a, b, c, lnK = [float(t) for t in theta]
                        K = float(math.exp(lnK))
                        best = {"L_inf": float(L_inf), "K": K, "a": a, "b": b, "c": c}

                if best is None:
                    params_by_group[gname] = DEFAULT_PARAMS.copy()
                else:
                    params_by_group[gname] = best

            law._params_by_group = params_by_group  # type: ignore[attr-defined]
            law._fitted = True  # type: ignore[attr-defined]
            _write_explain_md(params_by_group)
            return
        except Exception:
            _set_defaults(groups=[])
            law._params_by_group = params_by_group  # type: ignore[attr-defined]
            law._fitted = True  # type: ignore[attr-defined]
            _write_explain_md(params_by_group)
            return

    # Ensure parameters are available
    _fit_if_needed()

    # Prepare predictions
    params_for_group = getattr(law, "_params_by_group", {}).get(group)  # type: ignore[attr-defined]
    if params_for_group is None:
        params_for_group = getattr(law, "_params_by_group", {}).get("GLOBAL")  # type: ignore[attr-defined]
    if params_for_group is None:
        params_for_group = DEFAULT_PARAMS

    # If input_data is empty, this call can still be used to trigger fitting and explain.md generation.
    if not input_data:
        return []

    return _predict_with_params(params_for_group, input_data)

#2 Run 2 R² = 0.895969

▼

Python

from __future__ import annotations

import math
import os
from typing import Dict, List, Tuple

import numpy as np

try:
    from datasets import load_from_disk, DatasetDict, concatenate_datasets
except Exception:  # datasets might not be available in some environments
    load_from_disk = None
    DatasetDict = None
    concatenate_datasets = None

# Module-level storage for fitted parameters per group.
# Each value is a dict with keys: L_inf, K, a, b, c
_PARAMS: Dict[str, Dict[str, float]] = {}


def _safe_log(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    return np.log(np.clip(x, eps, None))


def _fit_power_law_with_floor(
    V: np.ndarray,
    Pnv: np.ndarray,
    N: np.ndarray,
    y: np.ndarray,
) -> Dict[str, float]:
    """
    Fit y ≈ L_inf + K * V^a * Pnv^b * N^c
    by grid-searching L_inf and doing linear least squares on logs for (K, a, b, c).
    """
    V = V.astype(np.float64)
    Pnv = Pnv.astype(np.float64)
    N = N.astype(np.float64)
    y = y.astype(np.float64)

    # Filter any non-positive inputs (cannot take logs)
    mask = (V > 0) & (Pnv > 0) & (N > 0) & np.isfinite(y)
    V, Pnv, N, y = V[mask], Pnv[mask], N[mask], y[mask]

    # Fallback if insufficient data
    if y.size < 4:
        return {"L_inf": float(max(0.0, np.min(y) - 1e-6) if y.size else 0.0),
                "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}

    y_min = float(np.min(y))
    y_range = float(np.max(y) - y_min + 1e-12)

    # Construct candidate L_inf values strictly below min(y)
    # Use a blend of linear and logarithmic spacing for robustness
    candidates: List[float] = []

    # Linear approach near min(y)
    linear_fracs = np.linspace(1e-4, 0.9, 40)
    for f in linear_fracs:
        candidates.append(y_min - f * max(y_min, 1e-6))

    # Log-spaced deltas away from min(y)
    if y_min > 0:
        deltas = np.logspace(math.log10(y_min * 1e-6), math.log10(y_min * 0.99), 40)
        for d in deltas:
            candidates.append(y_min - d)

    # Ensure 0 is considered if allowed (it is less than y_min if y_min > 0)
    if y_min > 0:
        candidates.append(0.0)

    # Deduplicate and sort
    cand_arr = np.unique(np.array(candidates, dtype=np.float64))
    cand_arr = cand_arr[cand_arr < y_min - 1e-12]
    if cand_arr.size == 0:
        cand_arr = np.array([y_min * 0.99], dtype=np.float64)

    x1 = _safe_log(V)
    x2 = _safe_log(Pnv)
    x3 = _safe_log(N)
    X = np.stack([np.ones_like(x1), x1, x2, x3], axis=1)

    best: Tuple[float, np.ndarray, float] | None = None  # (L_inf, beta, sse)

    for L0 in cand_arr:
        diff = y - L0
        if np.any(diff <= 0):
            continue
        z = np.log(diff)
        try:
            beta, residuals, rank, s = np.linalg.lstsq(X, z, rcond=None)
            if residuals.size:
                sse = float(residuals[0])
            else:
                # If residuals not returned (e.g., exact fit), compute manually
                z_hat = X @ beta
                sse = float(np.sum((z - z_hat) ** 2))
            if (best is None) or (sse < best[2]):
                best = (float(L0), beta, sse)
        except Exception:
            continue

    # Fallback if fit failed
    if best is None:
        return {"L_inf": float(y_min * 0.99), "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}

    L_star, beta, _ = best
    K = float(np.exp(beta[0]))
    a = float(beta[1])
    b = float(beta[2])
    c = float(beta[3])

    # Guard against pathological values
    if not np.isfinite(K) or K <= 0:
        K = 1.0
    for val in (a, b, c):
        if not np.isfinite(val):
            a, b, c = -0.2, -0.1, -0.3
            break

    return {"L_inf": float(L_star), "K": K, "a": a, "b": b, "c": c}


def _load_and_fit(path: str = "/app/data") -> Dict[str, Dict[str, float]]:
    """
    Load dataset from disk and fit parameters per group and globally.
    Expected fields: vocab_size, non_vocab_parameters, num_characters,
    unigram_normalized_loss, and a grouping column (default 'group').
    """
    params: Dict[str, Dict[str, float]] = {}

    if load_from_disk is None:
        # Datasets lib not available; return a generic global model
        params["_GLOBAL"] = {"L_inf": 0.0, "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}
        return params

    try:
        ds = load_from_disk(path)
    except Exception:
        params["_GLOBAL"] = {"L_inf": 0.0, "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}
        return params

    # Merge splits if present
    try:
        if isinstance(ds, DatasetDict):
            parts = [ds[k] for k in ds.keys()]
            ds = concatenate_datasets(parts)
    except Exception:
        pass

    colnames = set(getattr(ds, "column_names", []))
    # Try common group column names
    group_col = None
    for cand in ("group", "group_name", "Group", "dataset_group"):
        if cand in colnames:
            group_col = cand
            break

    # Extract arrays
    required = ["vocab_size", "non_vocab_parameters", "num_characters", "unigram_normalized_loss"]
    for req in required:
        if req not in colnames:
            # Missing expected columns; return generic
            params["_GLOBAL"] = {"L_inf": 0.0, "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}
            return params

    V_all = np.array(ds["vocab_size"], dtype=np.float64)
    Pnv_all = np.array(ds["non_vocab_parameters"], dtype=np.float64)
    N_all = np.array(ds["num_characters"], dtype=np.float64)
    y_all = np.array(ds["unigram_normalized_loss"], dtype=np.float64)

    # Fit global model
    params["_GLOBAL"] = _fit_power_law_with_floor(V_all, Pnv_all, N_all, y_all)

    # Fit per group if possible
    if group_col is not None:
        groups = np.array(ds[group_col])
        # Normalize group labels to strings for keys
        groups = np.array([str(g) for g in groups])
        unique_groups = np.unique(groups)
        for g in unique_groups:
            mask = (groups == g)
            Vg = V_all[mask]
            Pnvg = Pnv_all[mask]
            Ng = N_all[mask]
            yg = y_all[mask]
            params[str(g)] = _fit_power_law_with_floor(Vg, Pnvg, Ng, yg)

    return params


def _write_explain_md(params: Dict[str, Dict[str, float]], path: str = "/app/explain.md") -> None:
    """
    Write a human-readable explanation of the law and the fitted parameters.
    """
    lines: List[str] = []
    lines.append("# Scaling Law for Unigram-Normalized Loss vs. Vocabulary Size\n")
    lines.append("This document describes the discovered scaling law relating the unigram-normalized loss to:\n")
    lines.append("- vocabulary size V (`vocab_size`)\n- non-vocabulary parameters P_nv (`non_vocab_parameters`)\n- number of training characters N (`num_characters`).\n")
    lines.append("\n## Functional Form\n")
    lines.append("We model the loss L as a power-law with a floor (irreducible loss):\n")
    lines.append("\nL_hat = L_inf(group) + K(group) * V^{a(group)} * P_nv^{b(group)} * N^{c(group)}\n")
    lines.append("\n- L_inf(group): irreducible loss floor for the group.\n- K(group): scale factor.\n- a(group), b(group), c(group): exponents capturing how loss changes with V, P_nv, and N.\n")
    lines.append("\nThe functional form is identical across groups; only coefficients differ by group.\n")
    lines.append("\n## Fitting Methodology\n")
    lines.append("We fit parameters per group using the dataset at `/app/data` (loaded with `datasets.load_from_disk`).\n")
    lines.append("For each group, we grid-search candidate values for L_inf strictly below the minimum observed loss,\n")
    lines.append("and for each candidate we perform linear least squares on the log-transformed relation:\n")
    lines.append("\nlog(L - L_inf) = log K + a log V + b log P_nv + c log N\n")
    lines.append("\nWe select the L_inf that minimizes the residual sum of squares. A global model is also fit over all groups.\n")
    lines.append("\n## Fitted Parameters by Group\n")
    lines.append("The table below is generated at runtime when `law.py` is imported. If this table is empty or stale, run:\n")
    lines.append("\n```\npython -c \"import importlib, sys; sys.path.append('/app'); import law\"\n```\n")
    lines.append("\n| Group | L_inf | K | a | b | c |\n")
    lines.append("|---|---:|---:|---:|---:|---:|\n")
    # Add global first
    if "_GLOBAL" in params:
        p = params["_GLOBAL"]
        lines.append(f"| _GLOBAL | {p['L_inf']:.6g} | {p['K']:.6g} | {p['a']:.6g} | {p['b']:.6g} | {p['c']:.6g} |\n")
    # Then other groups in sorted order
    for g in sorted([k for k in params.keys() if k != "_GLOBAL"]):
        p = params[g]
        lines.append(f"| {g} | {p['L_inf']:.6g} | {p['K']:.6g} | {p['a']:.6g} | {p['b']:.6g} | {p['c']:.6g} |\n")

    # Best-effort write (ignore errors)
    try:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(lines)
    except Exception:
        pass


# Fit parameters during import (best effort) so law() can use them directly
try:
    _PARAMS = _load_and_fit("/app/data")
    if _PARAMS:
        _write_explain_md(_PARAMS, "/app/explain.md")
except Exception:
    # Leave _PARAMS possibly empty; law() will handle fallback
    _PARAMS = _PARAMS or {}


def _get_params_for_group(group: str) -> Dict[str, float]:
    if group in _PARAMS:
        return _PARAMS[group]
    if "_GLOBAL" in _PARAMS:
        return _PARAMS["_GLOBAL"]
    # Final fallback
    return {"L_inf": 0.0, "K": 1.0, "a": -0.2, "b": -0.1, "c": -0.3}


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Required keys per item:
                    - 'vocab_size'
                    - 'non_vocab_parameters'
                    - 'num_characters'
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is shared across groups; coefficients
                differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s):
        - 'unigram_normalized_loss'
    """
    params = _get_params_for_group(str(group))
    L_inf = float(params["L_inf"])
    K = float(params["K"])
    a = float(params["a"])
    b = float(params["b"])
    c = float(params["c"])

    preds: List[Dict[str, float]] = []
    eps = 1e-12

    for row in input_data:
        try:
            V = float(row["vocab_size"])
            Pnv = float(row["non_vocab_parameters"])
            N = float(row["num_characters"])
        except Exception as e:
            raise ValueError("Each input row must contain 'vocab_size', 'non_vocab_parameters', and 'num_characters'") from e

        # Ensure positivity for exponentiation stability
        V = V if V > 0 else eps
        Pnv = Pnv if Pnv > 0 else eps
        N = N if N > 0 else eps

        y_hat = L_inf + K * (V ** a) * (Pnv ** b) * (N ** c)
        # Guard against NaN/inf
        if not np.isfinite(y_hat):
            y_hat = float(L_inf)

        preds.append({"unigram_normalized_loss": float(y_hat)})

    return preds

#3 Run 3 R² = 0.861121

▼

Python

from __future__ import annotations

import math
from typing import Dict, List

import numpy as np

# We intentionally import inside the fit routine to avoid import-time failures
# in environments where `datasets` might not be installed for static analysis.
_DATA_PATH = "/app/data"

# Cache for per-group coefficients: group -> np.ndarray of shape (4,)
_COEFFS_BY_GROUP: dict[str, np.ndarray] = {}
_FITTED: bool = False


def _safe_log(x: float, eps: float = 1e-12) -> float:
    return math.log(max(float(x), eps))


def _fit_from_disk() -> None:
    """
    Fit a log-linear scaling law per experimental group:
        y = β0 + βV * log(V) + βP * log(Pnv) + βN * log(Nchars)
    where:
        y = unigram_normalized_loss
        V = vocab_size
        Pnv = non_vocab_parameters
        Nchars = num_characters
    """
    global _COEFFS_BY_GROUP, _FITTED

    if _FITTED:
        return

    try:
        from datasets import load_from_disk
    except Exception as e:
        # If datasets isn't available, defer fitting; predictions will raise with a clear message.
        raise RuntimeError(
            "The 'datasets' package is required to fit the scaling law from /app/data."
        ) from e

    ds = load_from_disk(_DATA_PATH)

    # Collect all rows across splits if a DatasetDict, else single dataset
    splits = []
    try:
        # DatasetDict has .values()
        splits = list(ds.values())  # type: ignore[attr-defined]
    except Exception:
        # Single split Dataset
        splits = [ds]

    rows: List[Dict[str, float]] = []
    for split in splits:
        # Iterating over HF Datasets yields dicts
        for rec in split:
            rows.append(rec)

    if not rows:
        raise RuntimeError(f"No data rows found in {_DATA_PATH}")

    # Build per-group buckets
    by_group: Dict[str, List[Dict[str, float]]] = {}
    for r in rows:
        g = r.get("group", "GLOBAL")
        by_group.setdefault(g, []).append(r)
    # Also keep a GLOBAL group with all data for fallback
    by_group["GLOBAL"] = rows

    coeffs: dict[str, np.ndarray] = {}

    for g, grp_rows in by_group.items():
        X_list: List[List[float]] = []
        y_list: List[float] = []
        for r in grp_rows:
            try:
                V = float(r["vocab_size"])
                Pnv = float(r["non_vocab_parameters"])
                Nchars = float(r["num_characters"])
                y = float(r["unigram_normalized_loss"])
            except KeyError:
                # Skip rows missing required fields
                continue

            X_list.append(
                [
                    1.0,
                    _safe_log(V),
                    _safe_log(Pnv),
                    _safe_log(Nchars),
                ]
            )
            y_list.append(y)

        X = np.asarray(X_list, dtype=np.float64)
        yv = np.asarray(y_list, dtype=np.float64)

        if X.shape[0] < 4:
            # Not enough data to fit reliably; skip and use GLOBAL later
            continue

        # Solve least squares: minimize ||X*β - y||
        beta, *_ = np.linalg.lstsq(X, yv, rcond=None)
        coeffs[g] = beta

    # Ensure we have a GLOBAL fit; this should exist unless data was empty or malformed
    if "GLOBAL" not in coeffs:
        # Attempt to fit GLOBAL minimally, error if impossible
        X_list: List[List[float]] = []
        y_list: List[float] = []
        for r in rows:
            try:
                V = float(r["vocab_size"])
                Pnv = float(r["non_vocab_parameters"])
                Nchars = float(r["num_characters"])
                y = float(r["unigram_normalized_loss"])
            except KeyError:
                continue
            X_list.append([1.0, _safe_log(V), _safe_log(Pnv), _safe_log(Nchars)])
            y_list.append(y)

        X = np.asarray(X_list, dtype=np.float64)
        yv = np.asarray(y_list, dtype=np.float64)
        if X.shape[0] < 4:
            raise RuntimeError("Insufficient data to fit even a GLOBAL model.")
        beta, *_ = np.linalg.lstsq(X, yv, rcond=None)
        coeffs["GLOBAL"] = beta

    _COEFFS_BY_GROUP = coeffs
    _FITTED = True


def _get_coeffs_for_group(group: str) -> np.ndarray:
    if not _FITTED:
        _fit_from_disk()
    # Exact group, else fallback to GLOBAL
    if group in _COEFFS_BY_GROUP:
        return _COEFFS_BY_GROUP[group]
    return _COEFFS_BY_GROUP["GLOBAL"]


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    beta = _get_coeffs_for_group(group)

    preds: List[Dict[str, float]] = []
    for row in input_data:
        V = float(row.get("vocab_size", 0.0))
        Pnv = float(row.get("non_vocab_parameters", 0.0))
        Nchars = float(row.get("num_characters", 0.0))
        x = np.array([1.0, _safe_log(V), _safe_log(Pnv), _safe_log(Nchars)], dtype=np.float64)
        y_hat = float(x @ beta)
        preds.append({"unigram_normalized_loss": y_hat})
    return preds


def _format_coeffs(beta: np.ndarray) -> str:
    # β0, βV, βP, βN
    return (
        f"beta0={beta[0]:.6g}, beta_V_log={beta[1]:.6g}, "
        f"beta_Pnv_log={beta[2]:.6g}, beta_Nchars_log={beta[3]:.6g}"
    )


if __name__ == "__main__":
    # Simple CLI to print fitted coefficients per group
    try:
        _fit_from_disk()
        print("Fitted coefficients by group (y = β0 + βV*log(V) + βP*log(Pnv) + βN*log(Nchars))")
        for g, b in sorted(_COEFFS_BY_GROUP.items()):
            print(f"- {g}: {_format_coeffs(b)}")
    except Exception as e:
        print(f"Failed to fit coefficients: {e}")

#4 Run 4 R² = -1.000000

▼

Python

from __future__ import annotations

import json
import math
import os
from typing import Dict, List, Tuple

import numpy as np

try:
    from datasets import load_from_disk, Dataset, DatasetDict  # type: ignore
except Exception:  # datasets may not be available in some contexts
    load_from_disk = None
    Dataset = None
    DatasetDict = None


# Paths (as specified by the task)
_DATA_PATH = "/app/data"
_COEFFS_PATH = "/app/coefficients.json"

# Small epsilon to avoid divide-by-zero and log domain issues
_EPS = 1e-12

# Global cache of fitted coefficients per group
# Each value is a dict with keys: L_inf, C, alpha, beta, gamma
_COEFFS: Dict[str, Dict[str, float]] = {}


def _safe_log(x: np.ndarray) -> np.ndarray:
    """Numerically safe natural log."""
    return np.log(np.clip(x, _EPS, None))


def _get_groups_from_dataset(ds_obj) -> Dict[str, List[Tuple[float, float, float, float]]]:
    """
    Extracts and groups data from a HuggingFace dataset object.

    Returns:
        Mapping: group_name -> list of tuples (vocab_size, non_vocab_parameters, num_characters, unigram_normalized_loss)
    """
    groups: Dict[str, List[Tuple[float, float, float, float]]] = {}

    def _add_example(ex: dict):
        try:
            V = float(ex["vocab_size"])
            Pnv = float(ex["non_vocab_parameters"])
            Nch = float(ex["num_characters"])
            L = float(ex["unigram_normalized_loss"])
        except Exception:
            return  # skip rows with missing/invalid fields

        # Group name (default to "ALL" if not provided)
        g = ex.get("group", "ALL")
        if not isinstance(g, str):
            g = str(g)

        groups.setdefault(g, []).append((V, Pnv, Nch, L))

    # Handle both Dataset and DatasetDict
    try:
        from datasets import Dataset as HFDataset, DatasetDict as HFDatasetDict  # type: ignore
    except Exception:
        HFDataset = None
        HFDatasetDict = None

    if HFDatasetDict is not None and isinstance(ds_obj, HFDatasetDict):
        for split in ds_obj.values():
            for ex in split:
                _add_example(ex)
    else:
        # Treat as a single split dataset or a generic iterable of dicts
        for ex in ds_obj:
            _add_example(ex)

    return groups


def _fit_group(records: List[Tuple[float, float, float, float]]) -> Dict[str, float]:
    """
    Fit parameters for one group using a multiplicative power-law with a loss floor:
        L_hat = L_inf + C * V^{-alpha} * Pnv^{-beta} * Nch^{-gamma}
    where V = vocab_size, Pnv = non_vocab_parameters, Nch = num_characters.

    We estimate L_inf via a 1D grid search and for each candidate perform
    linear regression on:
        log(L - L_inf) = log C - alpha log V - beta log Pnv - gamma log Nch
    """
    arr = np.array(records, dtype=float)
    if arr.ndim != 2 or arr.shape[1] != 4:
        # Fallback defaults if data malformed
        return {"L_inf": 0.0, "C": 1.0, "alpha": 0.2, "beta": 0.2, "gamma": 0.2}

    V = np.clip(arr[:, 0], _EPS, None)
    P = np.clip(arr[:, 1], _EPS, None)
    N = np.clip(arr[:, 2], _EPS, None)
    L = np.clip(arr[:, 3], _EPS, None)

    # Filter to rows with all finite values
    mask = np.isfinite(V) & np.isfinite(P) & np.isfinite(N) & np.isfinite(L)
    V, P, N, L = V[mask], P[mask], N[mask], L[mask]

    if V.size < 5:
        # Not enough data; use reasonable defaults
        return {"L_inf": float(np.maximum(0.0, np.min(L) * 0.5)) if L.size else 0.0,
                "C": 1.0, "alpha": 0.2, "beta": 0.2, "gamma": 0.2}

    min_L = float(np.min(L))
    # Candidate grid for L_inf between 0 and just below min(L)
    upper = max(0.0, min_L * 0.99)
    if upper <= 0:
        grid = np.array([0.0], dtype=float)
    else:
        # Dense near zero and near min(L) to stabilize the search
        grid = np.unique(np.concatenate([
            np.linspace(0.0, upper, num=50, dtype=float),
            np.geomspace(max(_EPS, upper / 1e6), upper, num=50, dtype=float)
        ]))
        grid = grid[(grid >= 0.0) & (grid < min_L)]

    best = {
        "sse": math.inf,
        "L_inf": 0.0,
        "C": 1.0,
        "alpha": 0.2,
        "beta": 0.2,
        "gamma": 0.2,
    }

    x1 = _safe_log(V)
    x2 = _safe_log(P)
    x3 = _safe_log(N)

    # Design matrix (with intercept) will be built once per grid element
    for L_inf_cand in grid:
        # Exclude points where L - L_inf <= 0
        valid = L > (L_inf_cand + _EPS)
        if np.count_nonzero(valid) < 4:
            continue

        y = _safe_log(L[valid] - L_inf_cand)
        X = np.column_stack([
            np.ones_like(y),
            x1[valid],
            x2[valid],
            x3[valid],
        ])

        # Linear least squares fit
        try:
            coeffs, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
        except Exception:
            continue

        # Compute SSE explicitly to be safe
        y_hat = X @ coeffs
        sse = float(np.sum((y - y_hat) ** 2))

        if sse < best["sse"]:
            # Map linear solution back to parameters
            logC, a1, a2, a3 = coeffs.tolist()
            C = float(np.exp(logC))
            alpha = float(-a1)
            beta = float(-a2)
            gamma = float(-a3)

            # Sanity constraints to avoid pathological exponents
            if not (np.isfinite(C) and np.isfinite(alpha) and np.isfinite(beta) and np.isfinite(gamma)):
                continue
            if C <= 0:
                continue
            # Clip exponents to a reasonable range
            alpha = float(np.clip(alpha, -4.0, 4.0))
            beta = float(np.clip(beta, -4.0, 4.0))
            gamma = float(np.clip(gamma, -4.0, 4.0))

            best.update({
                "sse": sse,
                "L_inf": float(L_inf_cand),
                "C": C,
                "alpha": alpha,
                "beta": beta,
                "gamma": gamma,
            })

    # If grid search failed to improve (e.g., due to degenerate data), try L_inf=0 fallback
    if not np.isfinite(best["sse"]) or best["sse"] == math.inf:
        L_inf_cand = 0.0
        valid = L > (L_inf_cand + _EPS)
        if np.count_nonzero(valid) >= 4:
            y = _safe_log(L[valid] - L_inf_cand)
            X = np.column_stack([np.ones_like(y), x1[valid], x2[valid], x3[valid]])
            coeffs, *_ = np.linalg.lstsq(X, y, rcond=None)
            logC, a1, a2, a3 = coeffs.tolist()
            best.update({
                "sse": 0.0,
                "L_inf": 0.0,
                "C": float(np.exp(logC)),
                "alpha": float(-a1),
                "beta": float(-a2),
                "gamma": float(-a3),
            })
        else:
            # Last resort defaults
            best.update({
                "sse": 0.0,
                "L_inf": float(np.maximum(0.0, min_L * 0.5)),
                "C": 1.0,
                "alpha": 0.2,
                "beta": 0.2,
                "gamma": 0.2,
            })

    # Drop SSE from output
    return {k: float(v) for k, v in best.items() if k != "sse"}


def _fit_all_groups() -> Dict[str, Dict[str, float]]:
    """
    Load the dataset from disk and fit coefficients per experimental group.
    Also fits an 'ALL' aggregate group as a fallback.
    """
    coeffs: Dict[str, Dict[str, float]] = {}

    if load_from_disk is None:
        return coeffs

    if not os.path.isdir(_DATA_PATH):
        return coeffs

    try:
        ds_obj = load_from_disk(_DATA_PATH)
    except Exception:
        return coeffs

    groups = _get_groups_from_dataset(ds_obj)

    # Fit per group
    for g, recs in groups.items():
        if recs:
            coeffs[g] = _fit_group(recs)

    # Also fit ALL (aggregate) if not already present
    if "ALL" not in coeffs:
        all_recs: List[Tuple[float, float, float, float]] = []
        for recs in groups.values():
            all_recs.extend(recs)
        if all_recs:
            coeffs["ALL"] = _fit_group(all_recs)

    # Persist for transparency and reproducibility
    try:
        with open(_COEFFS_PATH, "w", encoding="utf-8") as f:
            json.dump(coeffs, f, indent=2, sort_keys=True)
    except Exception:
        pass

    return coeffs


def _load_or_fit_coeffs() -> Dict[str, Dict[str, float]]:
    """
    Load coefficients from JSON if available; otherwise fit from the dataset.
    """
    # Try to load precomputed coefficients
    if os.path.isfile(_COEFFS_PATH):
        try:
            with open(_COEFFS_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Ensure floats
            out: Dict[str, Dict[str, float]] = {}
            for g, d in data.items():
                out[g] = {
                    "L_inf": float(d["L_inf"]),
                    "C": float(d["C"]),
                    "alpha": float(d["alpha"]),
                    "beta": float(d["beta"]),
                    "gamma": float(d["gamma"]),
                }
            return out
        except Exception:
            pass

    # Otherwise fit now
    return _fit_all_groups()


# Initialize coefficients at import time for immediate availability
_COEFFS = _load_or_fit_coeffs()


def _predict_one(row: Dict[str, float], coefs: Dict[str, float]) -> float:
    """Compute prediction for one input row given fitted coefficients."""
    V = float(row.get("vocab_size", 0.0))
    Pnv = float(row.get("non_vocab_parameters", 0.0))
    Nch = float(row.get("num_characters", 0.0))

    # Safety clamps
    V = V if np.isfinite(V) and V > 0 else _EPS
    Pnv = Pnv if np.isfinite(Pnv) and Pnv > 0 else _EPS
    Nch = Nch if np.isfinite(Nch) and Nch > 0 else _EPS

    L_inf = float(coefs.get("L_inf", 0.0))
    C = float(coefs.get("C", 1.0))
    alpha = float(coefs.get("alpha", 0.2))
    beta = float(coefs.get("beta", 0.2))
    gamma = float(coefs.get("gamma", 0.2))

    # L_hat = L_inf + C * V^{-alpha} * Pnv^{-beta} * Nch^{-gamma}
    try:
        term = C * (V ** (-alpha)) * (Pnv ** (-beta)) * (Nch ** (-gamma))
        pred = L_inf + term
    except Exception:
        pred = L_inf + C  # worst-case fallback

    # Ensure non-negative prediction
    return float(max(0.0, pred))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Required keys:
                      - 'vocab_size'
                      - 'non_vocab_parameters'
                      - 'num_characters'
        group: The name of the experimental group for which to make predictions.
               The functional form is identical across groups; parameters differ.

    Returns:
        A list of dictionaries, one per input item, each containing:
            {'unigram_normalized_loss': predicted_value}
    """
    # Choose coefficients for the requested group, with fallbacks
    coefs = _COEFFS.get(group)
    if coefs is None:
        coefs = _COEFFS.get("ALL")
    if coefs is None:
        # Final hardcoded fallback if fitting/loading failed
        coefs = {"L_inf": 0.0, "C": 1.0, "alpha": 0.2, "beta": 0.2, "gamma": 0.2}

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        y = _predict_one(row, coefs)
        outputs.append({"unigram_normalized_loss": y})
    return outputs

#5 Run 5 R² = -1.000000

▼

Python

from typing import List, Dict

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
                    Required keys per item:
                      - 'vocab_size'
                      - 'non_vocab_parameters'
                      - 'num_characters'
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups, but
               the coefficients are fitted per group from /app/data when first used.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s):
          - 'unigram_normalized_loss'
    """
    # Lazy, in-function cache so this file contains only a single top-level function as required.
    if not hasattr(law, "_cache"):
        setattr(law, "_cache", {
            "models": {},      # group -> {"w": np.ndarray, "mu": np.ndarray, "sigma": np.ndarray}
            "loaded": False,   # whether dataset has been attempted to load
            "ds": None,        # loaded dataset (train split)
            "group_col": None  # detected group column name
        })

    # Imports inside function to keep this file minimal and self-contained.
    import math
    import numpy as np

    cache = getattr(law, "_cache")

    def _safe_log(x: float, eps: float = 1e-12) -> float:
        return float(np.log(max(float(x), eps)))

    def _inv_sqrt(x: float, eps: float = 1e-12) -> float:
        return float((max(float(x), eps)) ** -0.5)

    def _features(v: float, p: float, c: float) -> np.ndarray:
        # Construct a fixed feature map (same for all groups):
        # 1, ln V, ln P, ln C, (ln V)^2, (ln P)^2, (ln C)^2,
        # ln V * ln P, ln V * ln C, ln P * ln C,
        # V^{-1/2}, P^{-1/2}, C^{-1/2}
        lv = _safe_log(v)
        lp = _safe_log(p)
        lc = _safe_log(c)
        iv = _inv_sqrt(v)
        ip = _inv_sqrt(p)
        ic = _inv_sqrt(c)
        return np.array([
            1.0,
            lv, lp, lc,
            lv * lv, lp * lp, lc * lc,
            lv * lp, lv * lc, lp * lc,
            iv, ip, ic,
        ], dtype=np.float64)

    def _load_dataset_once():
        if cache["loaded"]:
            return
        cache["loaded"] = True
        try:
            from datasets import load_from_disk, DatasetDict
            ds_any = load_from_disk("/app/data")
            # Pick a split if a DatasetDict is provided
            if isinstance(ds_any, dict) and not hasattr(ds_any, "column_names"):
                # Could be a plain dict-like; prefer 'train' if present
                ds = ds_any.get("train", next(iter(ds_any.values())))
            else:
                try:
                    # HuggingFace DatasetDict
                    if isinstance(ds_any, DatasetDict):
                        ds = ds_any["train"] if "train" in ds_any else next(iter(ds_any.values()))
                    else:
                        ds = ds_any
                except Exception:
                    ds = ds_any
            cache["ds"] = ds
            # Detect group column name, if any
            try:
                colnames = list(getattr(ds, "column_names"))
            except Exception:
                try:
                    colnames = list(getattr(ds, "features").keys())
                except Exception:
                    colnames = []
            for cand in ("group", "Group", "GROUP", "experiment_group", "variant", "condition"):
                if cand in colnames:
                    cache["group_col"] = cand
                    break
        except Exception:
            cache["ds"] = None
            cache["group_col"] = None

    def _fit_group_if_needed(g: str):
        if g in cache["models"]:
            return

        _load_dataset_once()
        ds = cache["ds"]

        # If dataset failed to load, provide a simple, safe fallback model.
        if ds is None:
            # Fallback: intercept-only model predicting a reasonable constant.
            import numpy as np
            w = np.array([1.0] + [0.0] * (13 - 1), dtype=np.float64)  # 13 features including intercept
            mu = np.zeros_like(w)
            sigma = np.ones_like(w)
            cache["models"][g] = {"w": w, "mu": mu, "sigma": sigma}
            return

        # Extract rows into Python lists without requiring pandas
        try:
            as_dict = ds.to_dict()  # {col: [vals]}
        except Exception:
            # Fallback slower path
            try:
                n = len(ds)
                as_dict = {name: [ds[i][name] for i in range(n)] for name in ds.column_names}
            except Exception:
                as_dict = {}

        def _col(name: str, default=None):
            return as_dict[name] if name in as_dict else default

        vs = _col("vocab_size", [])
        ps = _col("non_vocab_parameters", [])
        cs = _col("num_characters", [])
        ys = _col("unigram_normalized_loss", [])
        grp_col_name = cache["group_col"]
        grps = _col(grp_col_name, None) if grp_col_name is not None else None

        n_rows = min(len(vs), len(ps), len(cs), len(ys)) if all(isinstance(x, list) for x in (vs, ps, cs, ys)) else 0

        X_rows = []
        y_rows = []

        # Collect rows for the requested group; if insufficient, fall back to global (all groups)
        def _collect_rows(for_group: str | None):
            Xr, yr = [], []
            for i in range(n_rows):
                try:
                    v = float(vs[i]); p = float(ps[i]); c = float(cs[i]); y = float(ys[i])
                    if not (math.isfinite(v) and math.isfinite(p) and math.isfinite(c) and math.isfinite(y)):
                        continue
                    # Optional group filtering
                    if for_group is not None and grps is not None:
                        gi = grps[i]
                        if str(gi) != str(for_group):
                            continue
                    Xr.append(_features(v, p, c))
                    yr.append(y)
                except Exception:
                    continue
            return Xr, yr

        X_rows, y_rows = _collect_rows(g)
        # Fallback to all data if no or too few samples for this group
        if len(y_rows) < 5:
            X_rows, y_rows = _collect_rows(None)

        # If still empty, fallback to a trivial model
        if len(y_rows) == 0:
            import numpy as np
            w = np.array([1.0] + [0.0] * (13 - 1), dtype=np.float64)
            mu = np.zeros_like(w)
            sigma = np.ones_like(w)
            cache["models"][g] = {"w": w, "mu": mu, "sigma": sigma}
            return

        X = np.vstack(X_rows).astype(np.float64)  # shape [n, 13]
        y = np.asarray(y_rows, dtype=np.float64)  # shape [n]

        # Standardize non-intercept features for numerical stability
        n_features = X.shape[1]
        mu = np.zeros(n_features, dtype=np.float64)
        sigma = np.ones(n_features, dtype=np.float64)

        # Intercept at index 0 remains unstandardized
        for j in range(1, n_features):
            col = X[:, j]
            m = float(col.mean())
            s = float(col.std())
            if not math.isfinite(s) or s <= 1e-12:
                s = 1.0
            mu[j] = m
            sigma[j] = s
            X[:, j] = (X[:, j] - m) / s

        # Ridge-regularized least squares (no penalty on intercept)
        lam = 1e-6
        XtX = X.T @ X
        Xty = X.T @ y
        reg = np.eye(n_features, dtype=np.float64) * lam
        reg[0, 0] = 0.0  # do not regularize the intercept
        try:
            w = np.linalg.solve(XtX + reg, Xty)
        except np.linalg.LinAlgError:
            # Pseudo-inverse fallback
            w = np.linalg.pinv(XtX + reg) @ Xty

        cache["models"][g] = {"w": w, "mu": mu, "sigma": sigma}

    # Ensure a model exists for this group
    _fit_group_if_needed(group)
    model = cache["models"][group]
    w = model["w"]
    mu = model["mu"]
    sigma = model["sigma"]

    # Predict for each input row
    preds: List[Dict[str, float]] = []
    for row in input_data:
        v = float(row.get("vocab_size", 0.0))
        p = float(row.get("non_vocab_parameters", 0.0))
        c = float(row.get("num_characters", 0.0))
        phi = _features(v, p, c)
        # Standardize using training stats (except intercept)
        phi_std = phi.copy()
        if phi_std.shape[0] != w.shape[0]:
            # Feature dimension mismatch safeguard: fallback to a constant prediction
            y_hat = float(w[0])
        else:
            for j in range(1, phi_std.shape[0]):
                phi_std[j] = (phi_std[j] - mu[j]) / sigma[j]
            y_hat = float(phi_std.dot(w))
        # Ensure finite output; if not, fallback to intercept
        if not math.isfinite(y_hat):
            y_hat = float(w[0])
        preds.append({"unigram_normalized_loss": y_hat})

    return preds