from __future__ import annotations
from typing import Dict, Iterable, List, Tuple
import math

# We attempt to learn group-specific coefficients from /app/data at import time.
# The functional form is shared across groups:
# loss ≈ L0_g + s_g * (num_params ** a_g) * (parallel_size ** b_g)
#
# Where:
# - L0_g is an irreducible loss floor for group g
# - s_g is a scale factor
# - a_g < 0 captures improvement with model size
# - b_g < 0 captures improvement with the degree of parallelism (ensemble/aggregation)
#
# Coefficients are estimated by choosing L0 via a small grid search and
# fitting log(loss - L0) = log(s) + a*log(num_params) + b*log(parallel_size)
# with ordinary least squares. If the dataset is unavailable, we fall back to
# conservative defaults.
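#
# Worked example with illustrative (not fitted) coefficients: for
# L0 = 1.5, s = 400, a = -0.3, b = -0.1, a model with num_params = 1e8
# and parallel_size = 4 predicts
#   loss ≈ 1.5 + 400 * (1e8 ** -0.3) * (4 ** -0.1)
#        ≈ 1.5 + 400 * 0.00398 * 0.871
#        ≈ 2.89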
# Global, group-keyed coefficients: group -> (L0, s, a, b)
_COEFFS: Dict[str, Tuple[float, float, float, float]] = {}
# Fallback/global coefficients across all groups
_GLOBAL_COEFFS: Tuple[float, float, float, float] | None = None
_EPS = 1e-12
_DATA_PATH = "/app/data"


def _safe_log(x: Iterable[float]) -> List[float]:
    return [math.log(max(v, _EPS)) for v in x]


def _lstsq(X: List[List[float]], y: List[float]) -> Tuple[List[float], float]:
"""
Minimal OLS using normal equations with 3 features (intercept, x1, x2).
Returns (beta, sse) where beta = [b0, b1, b2].
"""
# Build normal equations: (X^T X) beta = X^T y
# X: n x 3
n = len(y)
if n == 0:
return [0.0, 0.0, 0.0], float("inf")
s00 = s01 = s02 = s11 = s12 = s22 = 0.0
t0 = t1 = t2 = 0.0
for i in range(n):
xi0, xi1, xi2 = X[i]
yi = y[i]
s00 += xi0 * xi0
s01 += xi0 * xi1
s02 += xi0 * xi2
s11 += xi1 * xi1
s12 += xi1 * xi2
s22 += xi2 * xi2
t0 += xi0 * yi
t1 += xi1 * yi
t2 += xi2 * yi
# Solve 3x3 system via Cramer's rule for robustness without numpy
# Matrix:
# [s00 s01 s02] [b0] = [t0]
# [s01 s11 s12] [b1] [t1]
# [s02 s12 s22] [b2] [t2]
def det3(a00, a01, a02, a10, a11, a12, a20, a21, a22) -> float:
return (
a00 * (a11 * a22 - a12 * a21)
- a01 * (a10 * a22 - a12 * a20)
+ a02 * (a10 * a21 - a11 * a20)
)
D = det3(s00, s01, s02, s01, s11, s12, s02, s12, s22)
if abs(D) < 1e-18:
# Degenerate; return zeros and high SSE
return [0.0, 0.0, 0.0], float("inf")
D0 = det3(t0, s01, s02, t1, s11, s12, t2, s12, s22)
D1 = det3(s00, t0, s02, s01, t1, s12, s02, t2, s22)
D2 = det3(s00, s01, t0, s01, s11, t1, s02, s12, t2)
b0, b1, b2 = D0 / D, D1 / D, D2 / D
    # Report the SSE of the fit in the same (log) space in which it was
    # computed, so fits can be compared consistently as a diagnostic; the
    # caller evaluates SSE in the original linear space, including L0.
    sse = 0.0
    for i in range(n):
        pred_log = b0 * X[i][0] + b1 * X[i][1] + b2 * X[i][2]
        e = y[i] - pred_log
        sse += e * e
    return [b0, b1, b2], sse
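# Quick sanity check for _lstsq (illustrative; not executed at import):
#
#     X = [[1.0, 0.0, 0.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]]
#     y = [2.0, 3.0, 4.0, 5.0]     # exactly y = 2 + 1*x1 + 2*x2
#     beta, sse = _lstsq(X, y)     # beta ≈ [2.0, 1.0, 2.0], sse ≈ 0.0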
def _fit_group(
y: List[float], n_params: List[float], p_size: List[float]
) -> Tuple[float, float, float, float]:
"""
Fit parameters (L0, s, a, b) for one group using grid search over L0 and OLS in log-space.
"""
# Sanity: ensure strictly positive features
n_params = [max(v, _EPS) for v in n_params]
p_size = [max(v, _EPS) for v in p_size]
y = [float(v) for v in y]
y_min = min(y)
y_max = max(y)
if not math.isfinite(y_min) or not math.isfinite(y_max):
        return (0.0, 1.0, -0.25, -0.5)
# Grid L0 below the minimum observed loss
span = max(y_max - y_min, 1e-6)
# 41 candidates from (y_min - 0.5*span) up to (y_min - 1e-6)
grid = [
(y_min - 0.5 * span) + i * (0.5 * span - 1e-6) / 40.0 for i in range(41)
]
best = None # (lin_sse, L0, b0, b1, b2)
x1 = _safe_log(n_params)
x2 = _safe_log(p_size)
for L0 in grid:
# Ensure y - L0 > 0
diff = [max(val - L0, _EPS) for val in y]
# Prepare OLS in log space: log(diff) = b0*1 + b1*log(n) + b2*log(p)
z = [math.log(d) for d in diff]
X = [[1.0, x1[i], x2[i]] for i in range(len(z))]
beta, _ = _lstsq(X, z)
b0, b1, b2 = beta
# Evaluate SSE in original space
sse = 0.0
for i in range(len(y)):
pred = L0 + math.exp(b0 + b1 * x1[i] + b2 * x2[i])
e = y[i] - pred
sse += e * e
if (best is None) or (sse < best[0]):
best = (sse, L0, b0, b1, b2)
if best is None:
        return (0.0, 1.0, -0.25, -0.5)
_, L0, b0, a, b = best
s = math.exp(b0)
return (L0, s, a, b)
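# Illustrative recovery test for _fit_group (not executed at import): on
# noise-free synthetic data, the grid search should land near the true L0
# and the OLS step near the true exponents.
#
#     n = [1e6, 1e7, 1e8, 1e6, 1e7, 1e8]
#     p = [1.0, 1.0, 1.0, 4.0, 4.0, 4.0]
#     y = [2.0 + 5.0 * ni ** -0.1 * pi ** -0.5 for ni, pi in zip(n, p)]
#     L0, s, a, b = _fit_group(y, n, p)   # roughly (2.0, 5.0, -0.1, -0.5)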
def _attempt_learn_coeffs() -> None:
global _COEFFS, _GLOBAL_COEFFS
try:
        from datasets import load_from_disk, DatasetDict, concatenate_datasets  # type: ignore
except Exception:
# Datasets library is unavailable; use defaults
_COEFFS = {}
_GLOBAL_COEFFS = (0.0, 1.0, -0.25, -0.5)
return
try:
ds = load_from_disk(_DATA_PATH)
except Exception:
# Dataset not present; defaults
_COEFFS = {}
_GLOBAL_COEFFS = (0.0, 1.0, -0.25, -0.5)
return
# Flatten to a single dataset
if isinstance(ds, DatasetDict):
        parts = list(ds.values())
try:
flat = concatenate_datasets(parts)
except Exception:
# Fallback: use the first split
flat = parts[0]
else:
flat = ds # type: ignore[assignment]
# Determine group field
cand_group_fields = ["group", "group_name", "dataset", "split"]
group_field = None
for k in cand_group_fields:
if k in flat.column_names:
group_field = k
break
# Required fields
required = ["num_params", "parallel_size", "loss"]
for r in required:
if r not in flat.column_names:
            # Required column missing; fall back to defaults
_COEFFS = {}
_GLOBAL_COEFFS = (0.0, 1.0, -0.25, -0.5)
return
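    # A row is expected to look like this (illustrative values; the group
    # column is whichever candidate field above matched, if any):
    #     {"group": "exp_a", "num_params": 1.2e7, "parallel_size": 8, "loss": 3.41}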
# Collect per-group data
by_group: Dict[str, Dict[str, List[float]]] = {}
for ex in flat:
g = str(ex[group_field]) if group_field is not None else "default"
d = by_group.setdefault(g, {"y": [], "n": [], "p": []})
try:
n = float(ex["num_params"])
p = float(ex["parallel_size"])
y = float(ex["loss"])
except Exception:
# Skip malformed rows
continue
if not (math.isfinite(n) and math.isfinite(p) and math.isfinite(y)):
continue
d["y"].append(y)
d["n"].append(n)
d["p"].append(p)
# Fit global coefficients
all_y: List[float] = []
all_n: List[float] = []
all_p: List[float] = []
for g, d in by_group.items():
all_y.extend(d["y"])
all_n.extend(d["n"])
all_p.extend(d["p"])
if len(all_y) >= 3:
_GLOBAL_COEFFS = _fit_group(all_y, all_n, all_p)
else:
_GLOBAL_COEFFS = (0.0, 1.0, -0.25, -0.5)
# Fit each group
coeffs: Dict[str, Tuple[float, float, float, float]] = {}
for g, d in by_group.items():
if len(d["y"]) >= 3:
coeffs[g] = _fit_group(d["y"], d["n"], d["p"])
else:
coeffs[g] = _GLOBAL_COEFFS # fallback
    _COEFFS = coeffs


_attempt_learn_coeffs()


def _predict_one(
n_params: float, p_size: float, coeffs: Tuple[float, float, float, float]
) -> float:
n_params = max(float(n_params), _EPS)
p_size = max(float(p_size), _EPS)
L0, s, a, b = coeffs
return L0 + s * (n_params ** a) * (p_size ** b)
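# For example, _predict_one(1e8, 4, (1.5, 400.0, -0.3, -0.1)) evaluates to
# 1.5 + 400 * (1e8 ** -0.3) * (4 ** -0.1) ≈ 2.89 (illustrative coefficients).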
def _coeffs_for_group(group: str) -> Tuple[float, float, float, float]:
if group in _COEFFS:
return _COEFFS[group]
if _GLOBAL_COEFFS is not None:
return _GLOBAL_COEFFS
# Ultimate fallback
    return (0.0, 1.0, -0.25, -0.5)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
"""
Predicts output variables based on input variables according to a discovered scaling law.
Args:
input_data: A list of dictionaries, where each dictionary is a single data
point containing input variable names as keys and their
corresponding values.
group: The name of the experimental group for which to make predictions.
The functional form of the law is shared, coefficients vary per group.
Returns:
A list of dictionaries, corresponding to the input_data list, with each
dictionary containing the predicted output variable(s).
"""
coeffs = _coeffs_for_group(group)
out: List[Dict[str, float]] = []
for row in input_data:
n = float(row.get("num_params", 0.0))
p = float(row.get("parallel_size", 0.0))
pred = _predict_one(n, p, coeffs)
out.append({"loss": float(pred)})
return out
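# Example call (group names depend on the dataset on disk; "default" is used
# when no group column is found):
#
#     law([{"num_params": 1e8, "parallel_size": 4}], group="default")
#     # -> [{"loss": <predicted loss for that configuration>}]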
def _write_explain(path: str = "/app/explain.md") -> None:
"""
Utility to write a human-readable explanation file with fitted parameters.
"""
lines: List[str] = []
lines.append("# Parallel Scaling Law for Language Modeling Loss")
lines.append("")
lines.append("We model the final loss as a shared functional form across groups:")
lines.append("")
lines.append("loss_hat = L0_g + s_g * num_params^{a_g} * parallel_size^{b_g}")
lines.append("")
lines.append("Interpretation:")
lines.append("- L0_g: irreducible loss floor for group g")
lines.append("- a_g < 0: larger models reduce loss via a power law")
lines.append("- b_g < 0: aggregating parallel outputs reduces loss (akin to ensembling)")
lines.append("")
lines.append("Fitting procedure:")
lines.append("- Grid search over L0 below min(loss) for numerical stability.")
lines.append("- For each L0, fit log(loss - L0) = log(s) + a*log(num_params) + b*log(parallel_size)")
lines.append("- Choose the L0 and coefficients minimizing squared error in the original space.")
lines.append("")
if _GLOBAL_COEFFS is not None:
L0, s, a, b = _GLOBAL_COEFFS
lines.append("Global coefficients (all groups pooled):")
lines.append(f"- L0 = {L0:.6g}, s = {s:.6g}, a = {a:.6g}, b = {b:.6g}")
lines.append("")
if _COEFFS:
lines.append("Per-group fitted coefficients:")
for g, (L0, s, a, b) in sorted(_COEFFS.items(), key=lambda kv: str(kv[0])):
lines.append(f"- {g}: L0 = {L0:.6g}, s = {s:.6g}, a = {a:.6g}, b = {b:.6g}")
lines.append("")
lines.append("Notes:")
lines.append("- The same functional form is used for every group; only the constants differ.")
lines.append("- The exponent b often trends near -0.5, consistent with variance reduction from aggregating parallel outputs.")
lines.append("- The model is intentionally simple to support extrapolation.")
content = "\n".join(lines) + "\n"
try:
with open(path, "w", encoding="utf-8") as f:
f.write(content)
except Exception:
# Best-effort; ignore write errors
pass
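# Manual usage sketch (hypothetical; nothing below runs at import):
#
#     _write_explain()   # refresh /app/explain.md with the current coefficients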