LR & Batch Size Scaling Law

Agent: aider
Model: GPT-5
Best R²: 0.025235
Mean R²: -0.659042
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.025235
Python
from __future__ import annotations

import math
from typing import Dict, List

import numpy as np

# Try to import `datasets`; if it is unavailable, predictions fall back to defaults.
try:
    from datasets import load_from_disk  # type: ignore
except Exception:  # pragma: no cover
    load_from_disk = None  # type: ignore


# Global container for fitted coefficients, keyed by group name. Populated by
# _ensure_fitted(), which runs on first prediction and once at import time.
COEFFS: Dict[str, Dict[str, float]] = {}
GLOBAL_GROUP_KEY = "__ALL__"

# Numerical safety epsilon for logs
_EPS = 1e-12

# Conservative default coefficients (weak dependence on every variable), used
# whenever fitting is impossible: `datasets` missing, loading failed, or no
# valid rows found.
_DEFAULT_COEFFS: Dict[str, float] = {
    "beta0": 1.0,
    "a_lr": 0.0,
    "a2_lr2": 0.1,
    "b_bsz": -0.02,
    "c_data": -0.1,
    "d_param": -0.1,
}

# Description of the functional form
FORMULA_DESC = (
    "log(lm_loss) = beta0_g + a_g*log(lr) + a2_g*(log(lr))^2 + "
    "b_g*log(bsz) + c_g*log(data_size) + d_g*log(non_embedding_param_size)\n"
    "=> lm_loss = exp(beta0_g + a_g*log(lr) + a2_g*(log(lr))^2 + "
    "b_g*log(bsz) + c_g*log(data_size) + d_g*log(non_embedding_param_size))"
)
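
# Exponentiating both sides gives an equivalent power-law view with an
# lr-dependent exponent (algebra sketch only; not used by the code):
#   lm_loss = C * lr**(a_g + a2_g*log(lr)) * bsz**b_g
#             * data_size**c_g * non_embedding_param_size**d_g
# with C = exp(beta0_g). The quadratic term makes log(lm_loss) a parabola in
# log(lr), which captures the usual U-shaped learning-rate sweep.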


def _safe_log(x: float) -> float:
    return math.log(max(float(x), _EPS))


def _design_row(lr: float, bsz: float, data_size: float, non_emb_params: float) -> np.ndarray:
    """
    Build a single feature row for the regression:
    [1, log(lr), (log(lr))^2, log(bsz), log(data_size), log(non_embedding_param_size)]
    """
    z_lr = _safe_log(lr)
    return np.array(
        [
            1.0,
            z_lr,
            z_lr * z_lr,
            _safe_log(bsz),
            _safe_log(data_size),
            _safe_log(non_emb_params),
        ],
        dtype=np.float64,
    )
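
# Example with hypothetical values: lr=1e-3, bsz=256, data_size=1e9,
# non_embedding_param_size=1.25e8 yields the feature row
#   [1.0, log(1e-3), log(1e-3)**2, log(256), log(1e9), log(1.25e8)]
# which is roughly [1.0, -6.9078, 47.717, 5.5452, 20.723, 18.644].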


def _fit_group(X: np.ndarray, y: np.ndarray, lam: float = 1e-6) -> np.ndarray:
    """
    Ridge-regularized least squares:
        (X^T X + lam I) w = X^T y
    """
    XT = X.T
    A = XT @ X
    # Ridge on all parameters including bias (small lam)
    A[np.diag_indices_from(A)] += lam
    b = XT @ y
    w = np.linalg.solve(A, b)
    return w
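
# Quick sanity check (sketch): with the tiny default lam the ridge solution is
# essentially ordinary least squares. For the noiseless line y = 2 + 3*x:
#   X = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]])
#   y = np.array([2.0, 5.0, 8.0])
#   _fit_group(X, y)  # -> approximately array([2., 3.])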


def _extract_dataset_rows(ds_item: dict) -> tuple[float, float, float, float, float, str | None]:
    """
    Extract lr, bsz, data_size, non_embedding_param_size, lm_loss, group (if present) from a dataset item.
    Returns tuple: (lr, bsz, data_size, non_emb_params, lm_loss, group)
    """
    # Missing keys raise KeyError here; the caller catches and skips such rows.
    lr = float(ds_item["lr"])
    bsz = float(ds_item["bsz"])
    data_size = float(ds_item["data_size"])
    non_emb = float(ds_item["non_embedding_param_size"])
    lm_loss = float(ds_item["lm_loss"])
    group = ds_item.get("group")
    if group is not None:
        group = str(group)
    return lr, bsz, data_size, non_emb, lm_loss, group
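
# Example item (hypothetical values; the keys are exactly those read above):
#   {"lr": 3e-4, "bsz": 512, "data_size": 2e9,
#    "non_embedding_param_size": 1.1e8, "lm_loss": 3.05, "group": "G1"}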


def _load_and_fit(path: str = "/app/data") -> Dict[str, Dict[str, float]]:
    """
    Load dataset from disk and fit per-group coefficients according to FORMULA_DESC.
    If datasets API is not available or loading fails, return a robust default.
    """
    coeffs: Dict[str, Dict[str, float]] = {}

    if load_from_disk is None:
        # Fallback: the `datasets` library is unavailable
        coeffs[GLOBAL_GROUP_KEY] = dict(_DEFAULT_COEFFS)
        return coeffs

    # Load dataset (can be a Dataset or a DatasetDict)
    try:
        ds = load_from_disk(path)
    except Exception:
        # Fallback: loading from disk failed
        coeffs[GLOBAL_GROUP_KEY] = dict(_DEFAULT_COEFFS)
        return coeffs

    # Collect all rows across splits if needed
    rows = []
    if hasattr(ds, "values"):  # DatasetDict
        for split in ds.values():
            rows.extend(list(split))
    else:  # Single Dataset
        rows = list(ds)

    # Partition by group (or GLOBAL group if group missing)
    groups: Dict[str, list[tuple[float, float, float, float, float]]] = {}
    for it in rows:
        try:
            lr, bsz, data_size, non_emb, lm_loss, group = _extract_dataset_rows(it)
        except Exception:
            continue

        # All variables must be strictly positive for the log transforms
        if lr <= 0 or bsz <= 0 or data_size <= 0 or non_emb <= 0 or lm_loss <= 0:
            continue

        gname = group if group is not None else GLOBAL_GROUP_KEY
        groups.setdefault(gname, []).append((lr, bsz, data_size, non_emb, lm_loss))

    # If no usable rows were found, bail to the defaults
    if not groups:
        coeffs[GLOBAL_GROUP_KEY] = dict(_DEFAULT_COEFFS)
        return coeffs

    # Also fit a global group across all data to use as fallback for unknown groups
    all_data = [rec for glist in groups.values() for rec in glist]
    groups_with_global = dict(groups)
    groups_with_global[GLOBAL_GROUP_KEY] = all_data

    # Fit per group
    for gname, glist in groups_with_global.items():
        if len(glist) < 6:  # Need at least as many points as parameters for a good fit
            continue
        X = np.vstack([_design_row(*rec[:4]) for rec in glist])  # n x 6
        y = np.array([_safe_log(rec[4]) for rec in glist], dtype=np.float64)  # log(lm_loss)

        try:
            w = _fit_group(X, y, lam=1e-6)
        except np.linalg.LinAlgError:
            # Very small increase in regularization if ill-conditioned
            w = _fit_group(X, y, lam=1e-3)

        coeffs[gname] = {
            "beta0": float(w[0]),
            "a_lr": float(w[1]),
            "a2_lr2": float(w[2]),
            "b_bsz": float(w[3]),
            "c_data": float(w[4]),
            "d_param": float(w[5]),
        }

    # In the rare case that fitting failed for some groups, ensure a global fallback exists
    if GLOBAL_GROUP_KEY not in coeffs:
        # Fit a quick global from whatever we have (if any), else use defaults
        if all_data:
            X = np.vstack([_design_row(*rec[:4]) for rec in all_data])
            y = np.array([_safe_log(rec[4]) for rec in all_data], dtype=np.float64)
            try:
                w = _fit_group(X, y, lam=1e-6)
            except np.linalg.LinAlgError:
                w = _fit_group(X, y, lam=1e-3)
            coeffs[GLOBAL_GROUP_KEY] = {
                "beta0": float(w[0]),
                "a_lr": float(w[1]),
                "a2_lr2": float(w[2]),
                "b_bsz": float(w[3]),
                "c_data": float(w[4]),
                "d_param": float(w[5]),
            }
        else:
            coeffs[GLOBAL_GROUP_KEY] = dict(_DEFAULT_COEFFS)

    return coeffs


def _write_explain_md(coeffs: Dict[str, Dict[str, float]], path: str = "/app/explain.md") -> None:
    """
    Generate a detailed explanation file including the functional form and fitted coefficients.
    """
    lines: List[str] = []
    lines.append("# Scaling Law for Final Language Modeling Loss\n")
    lines.append("This document describes the discovered scaling law relating the final language modeling loss (lm_loss) to training hyperparameters.\n")
    lines.append("## Functional Form\n")
    lines.append("We fit a log-linear model with a quadratic term in log(learning rate) to capture the typical U-shaped dependence on learning rate:\n")
    lines.append("log(lm_loss) = beta0_g + a_g*log(lr) + a2_g*(log(lr))^2 + b_g*log(bsz) + c_g*log(data_size) + d_g*log(non_embedding_param_size)\n")
    lines.append("\nEquivalently:\n")
    lines.append("lm_loss = exp(beta0_g + a_g*log(lr) + a2_g*(log(lr))^2 + b_g*log(bsz) + c_g*log(data_size) + d_g*log(non_embedding_param_size))\n")
    lines.append("\n- g denotes the experimental group. The functional form is identical across groups, while coefficients vary per group.\n")
    lines.append("\n## Methodology\n")
    lines.append("- Loaded the dataset from `/app/data` using `datasets.load_from_disk()`.\n")
    lines.append("- Filtered rows to ensure all variables are positive (required for logarithms).\n")
    lines.append("- Regressed log(lm_loss) on [1, log(lr), (log(lr))^2, log(bsz), log(data_size), log(non_embedding_param_size)] using ridge-regularized least squares (λ = 1e-6).\n")
    lines.append("- Fitted the model per group and also a global model across all data as a fallback.\n")
    lines.append("\n## Fitted Coefficients by Group\n")
    lines.append("The following coefficients were fitted programmatically at import time of `law.py`:\n")
    lines.append("\n")
    # Nicely format coefficients per group
    # Sort groups, showing GLOBAL first if present
    keys = list(coeffs.keys())
    if GLOBAL_GROUP_KEY in keys:
        keys.remove(GLOBAL_GROUP_KEY)
        keys = [GLOBAL_GROUP_KEY] + sorted(keys)
    else:
        keys = sorted(keys)
    for g in keys:
        c = coeffs[g]
        lines.append(f"### Group: {g}\n")
        lines.append(f"- beta0: {c['beta0']:.8f}\n")
        lines.append(f"- a (log lr): {c['a_lr']:.8f}\n")
        lines.append(f"- a2 (log lr)^2: {c['a2_lr2']:.8f}\n")
        lines.append(f"- b (log bsz): {c['b_bsz']:.8f}\n")
        lines.append(f"- c (log data_size): {c['c_data']:.8f}\n")
        lines.append(f"- d (log non_embedding_param_size): {c['d_param']:.8f}\n")
        lines.append("\n")

    try:
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
    except Exception:
        # If writing fails, silently ignore (not critical for predictions)
        pass


def _ensure_fitted() -> None:
    """
    Ensure that COEFFS is populated. If empty, attempt to load and fit.
    Also writes/updates /app/explain.md with the fitted coefficients.
    """
    global COEFFS
    if COEFFS:
        return
    coeffs = _load_and_fit("/app/data")
    COEFFS = coeffs
    # Best-effort write explain.md so the fitted numbers are visible
    _write_explain_md(COEFFS, "/app/explain.md")


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Required keys per item:
                    - 'lr'
                    - 'bsz'
                    - 'data_size'
                    - 'non_embedding_param_size'
        group: The name of the experimental group for which to make predictions.
               The functional form of the law is the same for all groups, but the
               coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s): {'lm_loss': float}.
    """
    _ensure_fitted()

    # Select coefficients for the requested group, fallback to global, then to any available group
    c = COEFFS.get(group)
    if c is None:
        c = COEFFS.get(GLOBAL_GROUP_KEY)
    if c is None and COEFFS:
        # Fallback to any one group deterministically
        any_group = sorted(COEFFS.keys())[0]
        c = COEFFS[any_group]
    if c is None:
        # Last-resort defaults (should not happen if fitting succeeded)
        c = _DEFAULT_COEFFS

    beta0 = c["beta0"]
    a_lr = c["a_lr"]
    a2_lr2 = c["a2_lr2"]
    b_bsz = c["b_bsz"]
    c_data = c["c_data"]
    d_param = c["d_param"]

    outputs: list[dict[str, float]] = []
    for item in input_data:
        try:
            lr = float(item["lr"])
            bsz = float(item["bsz"])
            data_size = float(item["data_size"])
            non_emb_params = float(item["non_embedding_param_size"])
        except Exception as e:
            raise ValueError("Each input item must contain 'lr', 'bsz', 'data_size', and 'non_embedding_param_size' as numeric values.") from e

        z_lr = _safe_log(lr)
        z_bsz = _safe_log(bsz)
        z_data = _safe_log(data_size)
        z_param = _safe_log(non_emb_params)

        ln_loss = (
            beta0
            + a_lr * z_lr
            + a2_lr2 * (z_lr * z_lr)
            + b_bsz * z_bsz
            + c_data * z_data
            + d_param * z_param
        )
        loss = float(math.exp(ln_loss))
        outputs.append({"lm_loss": loss})

    return outputs


# Ensure coefficients are fitted when the module is imported, so explain.md is generated.
try:
    _ensure_fitted()
except Exception:
    # Don't fail import in case of transient dataset issues; prediction will use defaults if necessary.
    pass
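
# Usage sketch with hypothetical values: the group name "G1" is illustrative;
# unknown names fall back to the globally fitted coefficients.
if __name__ == "__main__":
    demo_inputs = [
        {"lr": 3e-4, "bsz": 256, "data_size": 1e9, "non_embedding_param_size": 1.25e8},
        {"lr": 1e-3, "bsz": 512, "data_size": 4e9, "non_embedding_param_size": 3.5e8},
    ]
    for inp, out in zip(demo_inputs, law(demo_inputs, group="G1")):
        print(inp, "->", out)
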
#2 Run 2 R² = -0.773483
#3 Run 3 R² = -0.773483
#4 Run 4 R² = -0.773483
#5 Run 5 R² = -1.000000