from __future__ import annotations
import json
import math
import os
import re
from dataclasses import dataclass
from typing import Dict, Iterable, List, Tuple
import numpy as np
# Import the optional `datasets` dependency defensively; fall back to None if it is unavailable.
try:
from datasets import load_from_disk, Dataset, DatasetDict # type: ignore
except Exception: # pragma: no cover - optional dependency
load_from_disk = None # type: ignore
Dataset = None # type: ignore
DatasetDict = None # type: ignore
@dataclass
class FittedModel:
# theta: shape (n_targets, n_features), where n_features = 1 (intercept) + n_proportions
theta: np.ndarray
proportion_keys: List[str] # ordered list of feature keys used during fitting
target_keys: List[str] # ordered list of target keys used during fitting
# Global store of fitted coefficients per group.
_FITTED_BY_GROUP: Dict[str, FittedModel] = {}
# Numerical stabilizers
_EPS_P: float = 1e-12
_EPS_L: float = 1e-12
_RIDGE: float = 1e-3
def _safe_log(x: np.ndarray, eps: float) -> np.ndarray:
return np.log(np.clip(x, eps, None))
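# _safe_log clamps its argument from below before taking the log, e.g. (illustrative values):
#   _safe_log(np.array([0.0, 0.5]), 1e-12) -> [log(1e-12), log(0.5)] ≈ [-27.63, -0.69]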
def _detect_keys(columns: Iterable[str]) -> Tuple[List[str], List[str], str | None]:
cols = list(columns)
# Proportion keys like "proportion_domain_1" ... "proportion_domain_5"
prop_keys = sorted(
[c for c in cols if re.fullmatch(r"proportion_domain_\d+", c)],
key=lambda k: int(k.rsplit("_", 1)[1]),
)
# Target keys like "loss_domain_1" ... "loss_domain_5"
tgt_keys = sorted(
[c for c in cols if re.fullmatch(r"loss_domain_\d+", c)],
key=lambda k: int(k.rsplit("_", 1)[1]),
)
# Group column
group_col: str | None = None
if "group" in cols:
group_col = "group"
else:
# Fallback to any column ending with '_group' or named 'Group'
for cand in cols:
            if cand.lower().endswith("_group") or cand == "Group":
group_col = cand
break
return prop_keys, tgt_keys, group_col
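# Illustrative example (hypothetical column set): for columns
#   ["group", "proportion_domain_2", "proportion_domain_1", "loss_domain_1", "loss_domain_2"]
# _detect_keys returns
#   (["proportion_domain_1", "proportion_domain_2"], ["loss_domain_1", "loss_domain_2"], "group").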
def _dataset_to_rows(ds_obj) -> List[Dict[str, float]]:
# Convert Dataset or DatasetDict to a list of dict rows
rows: List[Dict[str, float]] = []
if DatasetDict is not None and isinstance(ds_obj, DatasetDict):
for split in ds_obj.values():
rows.extend(_dataset_to_rows(split))
return rows
# ds_obj is a Dataset or something iterable over dicts
try:
# Iterating a datasets.Dataset yields dicts row-wise efficiently
for row in ds_obj: # type: ignore
rows.append(row)
except Exception:
# Fallback: try to_dict
try:
data_dict = ds_obj.to_dict() # type: ignore
n = len(next(iter(data_dict.values())))
for i in range(n):
rows.append({k: v[i] for k, v in data_dict.items()})
except Exception:
pass
return rows
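# Each converted row is expected to be a flat mapping such as (hypothetical values):
#   {"group": "A", "proportion_domain_1": 0.2, ..., "loss_domain_1": 3.1, ...}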
def _fit_group(rows: List[Dict[str, float]], proportion_keys: List[str], target_keys: List[str]) -> FittedModel:
# Build X (design) and Y (targets)
X_list: List[List[float]] = []
Y_lists: List[List[float]] = [[] for _ in target_keys]
for r in rows:
try:
p_vec = np.array([float(r[k]) for k in proportion_keys], dtype=float)
if np.any(~np.isfinite(p_vec)):
continue
# Require all targets present and finite
y_vals = []
valid = True
for tk in target_keys:
val = float(r[tk])
if not (np.isfinite(val) and val > 0):
valid = False
break
y_vals.append(val)
if not valid:
continue
except Exception:
continue
x = [1.0]
x.extend(_safe_log(p_vec, _EPS_P).tolist())
X_list.append(x)
for i, y in enumerate(y_vals):
Y_lists[i].append(float(y))
if not X_list or any(len(yc) == 0 for yc in Y_lists):
        # Fallback: all-zero coefficients keep the model usable (every predicted loss is exp(0) = 1.0).
n_features = 1 + len(proportion_keys)
theta = np.zeros((len(target_keys), n_features), dtype=float)
# Intercepts default to log(1.0) = 0
return FittedModel(theta=theta, proportion_keys=proportion_keys, target_keys=target_keys)
X = np.asarray(X_list, dtype=float) # shape (n_samples, n_features)
n_features = X.shape[1]
XtX = X.T @ X
reg = np.eye(n_features, dtype=float)
reg[0, 0] = 0.0 # do not regularize intercept
XtX_reg = XtX + _RIDGE * reg
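    # Closed-form ridge solution per target: beta = (X^T X + lambda * R)^{-1} X^T y, where R is
    # the identity with a zero in the intercept position so the intercept is not shrunk.
    # Each target column is solved independently in the loop below.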
theta_rows: List[np.ndarray] = []
Xt = X.T
for y_vals in Y_lists:
y = _safe_log(np.asarray(y_vals, dtype=float), _EPS_L)
Xty = Xt @ y
try:
beta = np.linalg.solve(XtX_reg, Xty)
except np.linalg.LinAlgError:
beta = np.linalg.lstsq(XtX_reg, Xty, rcond=None)[0]
theta_rows.append(beta)
theta = np.vstack(theta_rows) # (n_targets, n_features)
return FittedModel(theta=theta, proportion_keys=proportion_keys, target_keys=target_keys)
def _fit_all_groups() -> None:
global _FITTED_BY_GROUP
if _FITTED_BY_GROUP:
return # already fit
# Attempt to load the dataset from disk
rows: List[Dict[str, float]] = []
if load_from_disk is not None:
try:
ds = load_from_disk("/app/data")
rows = _dataset_to_rows(ds)
except Exception:
rows = []
    # If no rows could be loaded, leave the list empty and fall back to a default model below.
all_columns = set()
for r in rows:
all_columns.update(r.keys())
prop_keys, tgt_keys, group_col = _detect_keys(all_columns)
    # Ensure we have the expected domain columns; if detection found none, fall back to the defaults.
if not prop_keys:
# Default to proportion_domain_1..5 if not present
prop_keys = [f"proportion_domain_{i}" for i in range(1, 6)]
if not tgt_keys:
tgt_keys = [f"loss_domain_{i}" for i in range(1, 6)]
if not rows:
# Build a default "ALL" model
_FITTED_BY_GROUP["ALL"] = _fit_group([], prop_keys, tgt_keys)
_write_explain_md(_FITTED_BY_GROUP)
return
# Partition rows by group (or single ALL group)
grouped: Dict[str, List[Dict[str, float]]] = {}
if group_col is None:
grouped["ALL"] = rows
else:
for r in rows:
g = r.get(group_col, "ALL")
grouped.setdefault(str(g), []).append(r)
# Fit per group
for g, gr_rows in grouped.items():
_FITTED_BY_GROUP[g] = _fit_group(gr_rows, prop_keys, tgt_keys)
# Also fit a global ALL group across everything for fallback
if "ALL" not in _FITTED_BY_GROUP:
_FITTED_BY_GROUP["ALL"] = _fit_group(rows, prop_keys, tgt_keys)
# Write explanation file including fitted parameters
_write_explain_md(_FITTED_BY_GROUP)
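# After fitting, _FITTED_BY_GROUP maps each group name (plus a global "ALL" fallback) to its
# FittedModel; law() looks up the requested group and falls back to "ALL" when the group is unknown.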
def _write_explain_md(fitted: Dict[str, FittedModel]) -> None:
# Prepare a deterministic JSON-like dump of parameters
payload = {}
for g, fm in fitted.items():
payload[g] = {
"proportion_keys": fm.proportion_keys,
"target_keys": fm.target_keys,
"theta": np.asarray(fm.theta, dtype=float).round(8).tolist(),
"model": "log-linear power law: log(loss_i) = theta[i,0] + sum_j theta[i,j] * log(p_j + eps); eps = %.0e"
% _EPS_P,
"ridge": _RIDGE,
}
section = [
"<!-- PARAMS START -->",
"Fitted parameter tensors by group (JSON):",
"",
"```json",
json.dumps(payload, indent=2),
"```",
"<!-- PARAMS END -->",
]
section_text = "\n".join(section) + "\n"
path = "/app/explain.md"
# If explain.md exists, replace the PARAM section; else, create a full file.
try:
if os.path.exists(path):
with open(path, "r", encoding="utf-8") as f:
content = f.read()
new_content: str
if "<!-- PARAMS START -->" in content and "<!-- PARAMS END -->" in content:
new_content = re.sub(
r"<!-- PARAMS START -->.*?<!-- PARAMS END -->",
section_text,
content,
flags=re.DOTALL,
)
else:
new_content = content.rstrip() + "\n\n" + section_text
with open(path, "w", encoding="utf-8") as f:
f.write(new_content)
else:
with open(path, "w", encoding="utf-8") as f:
f.write(_default_explain_md_header().rstrip() + "\n\n" + section_text)
except Exception:
# Best-effort; ignore file writing errors.
pass
def _default_explain_md_header() -> str:
return """# Scaling law for domain mixture in language model pre-training
This document describes the discovered scaling law that predicts per-domain validation loss from the domain mixture proportions used during pre-training.
## Functional form (shared across groups)
We model each domain's validation loss as a multiplicative power-law function of the mixture proportions:
- For domain i in {1..5}, with mixture proportions p_j for j in {1..5}:
  loss_i = A_i * Π_j max(p_j, ε)^{β_{i,j}}
Equivalently in log-space (which is what we fit):
- log(loss_i) = θ_{i,0} + Σ_{j=1..5} θ_{i,j} * log(max(p_j, ε))
where A_i = exp(θ_{i,0}) and β_{i,j} = θ_{i,j}.
We clamp each proportion from below at ε = 1e-12 so the logarithm remains finite when a proportion is zero.
We estimate parameters with ridge-regularized least squares on the log-transformed variables, with L2 regularization λ = 1e-3 applied to non-intercept weights.
This form captures:
- Diminishing returns (via negative exponents).
- Cross-domain transfer (exponents β_{i,j} coupling domains).
- A simple multiplicative structure: rescaling all proportions by a common factor multiplies each predicted loss by a constant factor.
## Methodology
- Load the dataset at /app/data using datasets.load_from_disk().
- Identify input features: proportion_domain_1..proportion_domain_5.
- Identify targets: loss_domain_1..loss_domain_5.
- If a 'group' column is present, fit a separate parameter set per group; otherwise, fit a single ALL group.
- Optimize θ for each target independently using ridge regression in log space.
- Use the fitted θ to predict losses for new inputs by exponentiating the linear predictor.
## Fitted parameters per group
The exact fitted coefficients depend on the dataset available at runtime.
The block below is automatically populated by /app/law.py when the module is imported and the model is fit.
"""
# Fit on import so that law() can immediately use the parameters and /app/explain.md is populated.
_fit_all_groups()
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
"""
Predicts output variables based on input variables according to a discovered scaling law.
Args:
input_data: A list of dictionaries, where each dictionary is a single data
point containing input variable names as keys and their
corresponding values.
group: The name of the experimental group for which to make predictions.
The functional form of the law must be the same for all groups,
but the constant parameters/coefficients can differ per group.
Returns:
A list of dictionaries, corresponding to the input_data list, with each
dictionary containing the predicted output variable(s).
"""
# Ensure models are fit (idempotent)
_fit_all_groups()
# Choose fitted coefficients for the requested group; fallback to "ALL"
fm = _FITTED_BY_GROUP.get(group) or _FITTED_BY_GROUP.get("ALL")
if fm is None:
# Last-resort fallback with default keys and zero coefficients
proportion_keys = [f"proportion_domain_{i}" for i in range(1, 6)]
target_keys = [f"loss_domain_{i}" for i in range(1, 6)]
fm = FittedModel(theta=np.zeros((len(target_keys), 1 + len(proportion_keys))), # type: ignore
proportion_keys=proportion_keys,
target_keys=target_keys)
# Build predictions
out: List[Dict[str, float]] = []
prop_keys = fm.proportion_keys
theta = np.asarray(fm.theta, dtype=float)
# If incoming dicts have a different set of proportion keys, try to realign
# to the canonical order based on numeric suffix.
def canonicalize_keys(keys: List[str]) -> List[str]:
return sorted(keys, key=lambda k: int(k.rsplit("_", 1)[1]) if re.fullmatch(r".*_\d+", k) else math.inf)
for row in input_data:
        # If the row has all canonical keys, use them; otherwise align candidate keys by
        # numeric suffix. On a length mismatch fall back to the fitted keys so the feature
        # vector always matches theta's shape (missing keys default to 0.0 below).
        if not all(k in row for k in prop_keys):
            candidate = [k for k in row.keys() if re.fullmatch(r"proportion_domain_\d+", k)]
            if len(candidate) == len(prop_keys):
                prop_keys_runtime = canonicalize_keys(candidate)
            else:
                prop_keys_runtime = prop_keys  # fall back
        else:
            prop_keys_runtime = prop_keys
p_vec = np.array([float(row.get(k, 0.0)) for k in prop_keys_runtime], dtype=float)
x = np.empty(1 + p_vec.size, dtype=float)
x[0] = 1.0
x[1:] = _safe_log(p_vec, _EPS_P)
# Predict each target independently in log-space then exponentiate.
yhat_log = theta @ x # shape (n_targets,)
yhat = np.exp(yhat_log)
# Produce outputs with canonical target keys (loss_domain_1..5)
pred: Dict[str, float] = {}
# Map predictions to fm.target_keys order; also ensure we output exactly loss_domain_1..5
for idx, tk in enumerate(fm.target_keys):
pred[tk] = float(yhat[idx])
        # If any expected loss_domain_i is missing (e.g. the dataset used different target
        # names), fill the canonical keys from the prediction vector.
        for i in range(1, 6):
            key = f"loss_domain_{i}"
            if key not in pred and yhat.size > 0:
                pred[key] = float(yhat[min(i - 1, yhat.size - 1)])
out.append(pred)
return out
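if __name__ == "__main__":
    # Minimal usage sketch with hypothetical inputs: five mixture proportions summing to 1.0.
    # The "ALL" group always exists; unknown group names also fall back to "ALL" inside law().
    demo_input = [{f"proportion_domain_{i}": p for i, p in zip(range(1, 6), [0.4, 0.3, 0.15, 0.1, 0.05])}]
    print(law(demo_input, group="ALL"))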