SLD - U-shaped Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = 0.380703

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Cache learned coefficients on the function object to avoid repeated I/O/fits.
    if not hasattr(law, "_coeffs"):
        # Fit a U-shaped (convex) scaling law per group on first call:
        #   brier_score_hat = y0_g + k_g * (log_flops - m_g)^2
        # where k_g >= 0 ensures a U-shape. We determine m_g by 1D search and
        # solve y0_g, k_g by closed-form least squares for each candidate m_g.
        def _load_dataset():
            try:
                from datasets import load_from_disk, Dataset, DatasetDict  # type: ignore
            except Exception:
                return None
            try:
                ds = load_from_disk("/app/data")
            except Exception:
                return None
            return ds

        def _iter_rows(ds):
            # Yield dictionaries with keys including 'log_flops', 'brier_score', and 'group' (if present)
            try:
                from datasets import Dataset, DatasetDict  # type: ignore
            except Exception:
                Dataset = object  # type: ignore
                DatasetDict = dict  # type: ignore
            if isinstance(ds, dict) or str(type(ds)).endswith("DatasetDict'>"):
                for split in ds.values():
                    for row in split:
                        yield dict(row)
            else:
                for row in ds:
                    yield dict(row)

        def _fit_group(points):
            # Fit y = y0 + k * (x - m)^2 with k >= 0 by grid-search over m and
            # closed-form LS for (y0, k) at each m.
            xs = [p[0] for p in points]
            ys = [p[1] for p in points]
            n = len(xs)
            if n == 0:
                return (0.2, 0.01, 10.0, float("nan"))  # y0, k, m, mse
            if n == 1:
                # With one point, place vertex at x and set k very small.
                return (ys[0], 1e-6, xs[0], 0.0)
            xmin, xmax = min(xs), max(xs)
            # Expand search range slightly to allow vertex just outside observed x.
            margin = max(1e-6, 0.05 * (xmax - xmin) if xmax > xmin else 0.5)
            lo, hi = xmin - margin, xmax + margin
            best = (float("inf"), 0.0, 0.0, 0.0)  # mse, y0, k, m
            # Build a small grid over m; denser if we have more data
            steps = max(21, min(101, 5 * n))
            for i in range(steps):
                m = lo + (hi - lo) * i / (steps - 1)
                # Features: z = (x - m)^2, model: y = y0 + k*z
                z = [(x - m) ** 2 for x in xs]
                Sz = sum(z)
                Sz2 = sum(zz * zz for zz in z)
                Sy = sum(ys)
                Szy = sum(z[i] * ys[i] for i in range(n))
                lam = 1e-12  # tiny ridge for numerical stability
                a11 = n + lam
                a12 = Sz
                a22 = Sz2 + lam
                det = a11 * a22 - a12 * a12
                if det == 0.0:
                    continue
                # Solve 2x2 system:
                y0 = (Sy * a22 - a12 * Szy) / det
                k = (a11 * Szy - a12 * Sy) / det
                # Enforce convexity (U-shape)
                if k < 0.0:
                    k = 0.0
                preds = [y0 + k * z[i] for i in range(n)]
                mse = sum((preds[i] - ys[i]) ** 2 for i in range(n)) / n
                if mse < best[0]:
                    best = (mse, y0, k, m)
            _, y0b, kb, mb = best
            return (y0b, kb, mb, best[0])

        # Try to load and fit from dataset; if unavailable, fall back to a generic prior.
        ds = _load_dataset()
        coeffs = {}  # group -> (y0, k, m, mse, n)
        all_points = []
        group_key = "group"
        if ds is not None:
            # Peek first row to detect group key if different
            try:
                first_row = next(_iter_rows(ds))
                # Detect a plausible group key if 'group' not present
                if group_key not in first_row:
                    for cand in ("group", "dataset", "family", "arch", "setting"):
                        if cand in first_row:
                            group_key = cand
                            break
                # Include the first row back (we consumed it)
                rows_iter = (r for r in ([first_row] + list(_iter_rows(ds))))
            except StopIteration:
                rows_iter = iter([])
            # Collect points per group
            grouped = {}
            for row in rows_iter:
                try:
                    x = float(row["log_flops"])
                    y = float(row["brier_score"])
                except Exception:
                    continue
                g = str(row.get(group_key, "ALL"))
                grouped.setdefault(g, []).append((x, y))
                all_points.append((x, y))
            # Fit per group
            for g, pts in grouped.items():
                y0, k, m, mse = _fit_group(pts)
                coeffs[g] = (y0, k, m, mse, len(pts))
            # Also fit a global fallback across all data
            if all_points:
                y0, k, m, mse = _fit_group(all_points)
                coeffs.setdefault("ALL", (y0, k, m, mse, len(all_points)))
        # Fallback if dataset couldn't be loaded
        if not coeffs:
            # Reasonable, convex U-shape prior in log_flops
            coeffs = {
                "ALL": (0.2, 0.01, 10.0, float("nan"), 0),
            }

        # Store cache
        law._coeffs = coeffs  # type: ignore[attr-defined]

        # Try to write a human-readable report to /app/explain.md
        try:
            lines = []
            lines.append("# U-shaped scaling law for Brier score vs. log_flops\n")
            lines.append("We model final performance (lower Brier is better) as a convex quadratic in log compute:\n")
            lines.append("\n")
            lines.append("brier_score_hat = y0_g + k_g * (log_flops - m_g)^2\n")
            lines.append("\n")
            lines.append("where the functional form is shared across groups g, and (y0_g, k_g, m_g) are group-specific parameters fit via least squares with a grid-search over the vertex location m_g, enforcing k_g >= 0.\n")
            lines.append("\n")
            lines.append("## Fitted coefficients by group\n")
            lines.append("\n")
            lines.append("| group | y0 | k | m | MSE (fit) | n |\n")
            lines.append("|---|---:|---:|---:|---:|---:|\n")
            # Sort keys for reproducibility
            for g in sorted(law._coeffs.keys()):  # type: ignore[attr-defined]
                y0, k, m, mse, n = law._coeffs[g]  # type: ignore[index]
                def _fmt(v):
                    if v != v:  # NaN
                        return "NaN"
                    return f"{v:.6g}"
                lines.append(f"| {g} | {_fmt(y0)} | {_fmt(k)} | {_fmt(m)} | {_fmt(mse)} | {n} |\n")
            with open("/app/explain.md", "w", encoding="utf-8") as f:
                f.writelines(lines)
        except Exception:
            # Silently ignore if we cannot write the report (read-only FS, etc.)
            pass

    # Do predictions using cached coefficients.
    coeffs = law._coeffs  # type: ignore[attr-defined]
    results: list[dict[str, float]] = []
    # Choose coeffs: exact group -> fallback to "ALL" -> last resort prior
    cg = coeffs.get(group)
    if cg is None:
        cg = coeffs.get("ALL", (0.2, 0.01, 10.0, float("nan"), 0))
    y0, k, m = cg[0], cg[1], cg[2]
    for row in (input_data or []):
        try:
            x = float(row["log_flops"])
        except Exception:
            # If missing, predict baseline y0
            results.append({"brier_score": float(y0)})
            continue
        yhat = y0 + k * (x - m) ** 2
        results.append({"brier_score": float(yhat)})
    return results

#2 Run 2 R² = 0.246827

▼

Python

"""
Scaling law predictor for U-shaped scaling of Brier score vs. log_flops.

We model, for each experimental group g:
    brier_score ≈ a_g * (log_flops - c_g) ** 2 + b_g
with a_g ≥ 0 enforcing a U-shaped curve that achieves its minimum at log_flops = c_g.

Parameters are estimated once from the dataset at /app/data (if available) using:
- A coarse-to-fine grid search over c_g (vertex) within the observed range of log_flops
- Closed-form ordinary least squares for (a_g, b_g) given c_g via the design [ (x - c_g)^2, 1 ]
- A small non-negativity clamp on a_g to maintain the U-shape

If the dataset is not available at runtime, a conservative global default is used.
"""

from __future__ import annotations

from typing import Dict, List, Tuple, Iterable
import math

DATASET_PATH = "/app/data"
GLOBAL_KEY = "__GLOBAL__"
# Small positive curvature to ensure U-shaped behavior even in degenerate cases
_A_MIN = 1e-12

# Cache of fitted parameters per group: {group: {"a": float, "b": float, "c": float}}
_PARAMETERS: Dict[str, Dict[str, float]] = {}
# Cache of per-group sample sizes and fit quality (for optional introspection/printing)
_FIT_STATS: Dict[str, Dict[str, float]] = {}

def _iter_rows_from_hf(ds) -> Iterable[dict]:
    """Yield rows from a HuggingFace Dataset or DatasetDict in a safe, unified way."""
    try:
        from datasets import Dataset, DatasetDict
    except Exception:
        # If datasets isn't available, nothing to iterate
        return []
    if hasattr(ds, "keys") and callable(getattr(ds, "keys", None)):
        # Likely a DatasetDict
        # Prefer 'train' if present; otherwise iterate all splits
        if "train" in ds:
            for row in ds["train"]:
                yield row
        else:
            for split in ds.keys():
                for row in ds[split]:
                    yield row
    else:
        # Single Dataset
        for row in ds:
            yield row

def _safe_float(v) -> float | None:
    try:
        f = float(v)
        if math.isfinite(f):
            return f
        return None
    except Exception:
        return None

def _ols_two_feature(zs: List[float], ys: List[float]) -> Tuple[float, float]:
    """
    Closed-form OLS for y ≈ a * z + b given feature z and intercept.
    Returns (a, b). Uses numerically stable sums and handles degeneracies.
    """
    n = len(zs)
    if n == 0:
        return (_A_MIN, 0.0)

    sz = 0.0
    szz = 0.0
    sy = 0.0
    syz = 0.0
    for z, y in zip(zs, ys):
        sz += z
        szz += z * z
        sy += y
        syz += y * z

    det = szz * n - sz * sz
    if abs(det) > 0.0:
        inv00 = n / det
        inv01 = -sz / det
        inv10 = -sz / det
        inv11 = szz / det
        a = inv00 * syz + inv01 * sy
        b = inv10 * syz + inv11 * sy
    else:
        # Degenerate: fall back to mean-based slope if possible
        mean_z = sz / n if n else 0.0
        mean_y = sy / n if n else 0.0
        num = 0.0
        den = 0.0
        for z, y in zip(zs, ys):
            dz = z - mean_z
            num += (y - mean_y) * dz
            den += dz * dz
        a = (num / den) if den > 0.0 else 0.0
        b = mean_y - a * mean_z

    # Enforce non-negativity on a (U-shape opening upwards). If clamped, adjust b optimally.
    if a < _A_MIN:
        a = _A_MIN
        # Optimal b given fixed a minimizes MSE: b = mean(y - a*z)
        b = (sy - a * sz) / n if n else b
    return (a, b)

def _fit_u_shape(xs: List[float], ys: List[float]) -> Tuple[float, float, float, float]:
    """
    Fit y ≈ a*(x - c)^2 + b with a ≥ 0 using:
        - Grid search over c in [min(xs), max(xs)]
        - Closed-form OLS for (a, b) given c
    Returns (a, b, c, mse).
    """
    n = len(xs)
    if n == 0:
        return (_A_MIN, 0.0, 0.0, float("inf"))

    x_min = min(xs)
    x_max = max(xs)
    if not math.isfinite(x_min) or not math.isfinite(x_max):
        return (_A_MIN, 0.0, 0.0, float("inf"))

    # If all xs equal, set c to that value and fit a,b
    if x_max == x_min:
        c = x_min
        zs = [(x - c) ** 2 for x in xs]
        a, b = _ols_two_feature(zs, ys)
        mse = sum((a * z + b - y) ** 2 for z, y in zip(zs, ys)) / n
        return (a, b, c, mse)

    # Coarse-to-fine grid for c
    # Coarse grid
    best = (float("inf"), _A_MIN, 0.0, (x_min + x_max) * 0.5)  # (mse, a, b, c)
    for num in (41, 81, 161):  # progressively finer
        best_mse, best_a, best_b, best_c = best
        if num <= 1:
            candidates = [best_c]
        else:
            step = (x_max - x_min) / (num - 1)
            candidates = [x_min + i * step for i in range(num)]
        for c in candidates:
            zs = [(x - c) ** 2 for x in xs]
            a, b = _ols_two_feature(zs, ys)
            mse = sum((a * z + b - y) ** 2 for z, y in zip(zs, ys)) / n
            if mse < best_mse:
                best = (mse, a, b, c)

        # Narrow the search window around current best c for the next iteration
        best_mse, best_a, best_b, best_c = best
        span = (x_max - x_min) * 0.25
        x_min = max(min(best_c - span, best_c), min(xs))
        x_max = min(max(best_c + span, best_c), max(xs))

    mse, a, b, c = best
    # Final small local refinement around best c
    local_span = (max(xs) - min(xs)) * 0.05
    if local_span > 0:
        local_candidates = [c + t * local_span for t in (-1.0, -0.5, 0.0, 0.5, 1.0)]
        for c2 in local_candidates:
            zs = [(x - c2) ** 2 for x in xs]
            a2, b2 = _ols_two_feature(zs, ys)
            mse2 = sum((a2 * z + b2 - y) ** 2 for z, y in zip(zs, ys)) / n
            if mse2 < mse:
                mse, a, b, c = mse2, a2, b2, c2

    return (a, b, c, mse)

def _fit_parameters_from_dataset() -> Tuple[Dict[str, Dict[str, float]], Dict[str, Dict[str, float]]]:
    """
    Load the dataset, fit parameters per group, and return:
      (parameters, fit_stats)
    """
    params: Dict[str, Dict[str, float]] = {}
    stats: Dict[str, Dict[str, float]] = {}

    try:
        from datasets import load_from_disk  # type: ignore
        ds = load_from_disk(DATASET_PATH)
    except Exception:
        # Dataset not available; return empty and let caller handle defaults
        return (params, stats)

    # Collect per-group data
    per_group_xs: Dict[str, List[float]] = {}
    per_group_ys: Dict[str, List[float]] = {}
    all_xs: List[float] = []
    all_ys: List[float] = []

    for row in _iter_rows_from_hf(ds):
        x = _safe_float(row.get("log_flops"))
        y = _safe_float(row.get("brier_score"))
        if x is None or y is None:
            continue
        g = row.get("group")
        # Fallbacks if 'group' column is absent
        if g is None:
            g = row.get("Group") or row.get("dataset") or row.get("family") or GLOBAL_KEY
        g = str(g)

        per_group_xs.setdefault(g, []).append(x)
        per_group_ys.setdefault(g, []).append(y)
        all_xs.append(x)
        all_ys.append(y)

    # Fit per group
    for g, xs in per_group_xs.items():
        ys = per_group_ys[g]
        a, b, c, mse = _fit_u_shape(xs, ys)
        params[g] = {"a": float(a), "b": float(b), "c": float(c)}
        stats[g] = {"n": float(len(xs)), "mse": float(mse)}

    # Fit global as fallback
    if all_xs:
        a, b, c, mse = _fit_u_shape(all_xs, all_ys)
        params.setdefault(GLOBAL_KEY, {"a": float(a), "b": float(b), "c": float(c)})
        stats.setdefault(GLOBAL_KEY, {"n": float(len(all_xs)), "mse": float(mse)})

    return (params, stats)

def _ensure_fitted() -> None:
    """Fit parameters once per process if not already fitted."""
    global _PARAMETERS, _FIT_STATS
    if _PARAMETERS:
        return
    params, stats = _fit_parameters_from_dataset()
    if params:
        _PARAMETERS = params
        _FIT_STATS = stats
    else:
        # Dataset missing; set conservative defaults
        _PARAMETERS = {
            GLOBAL_KEY: {"a": 1e-3, "b": 0.2, "c": 0.0},
        }
        _FIT_STATS = {GLOBAL_KEY: {"n": 0.0, "mse": float("nan")}}

def _predict_one(x: float, p: Dict[str, float]) -> float:
    a = p["a"]
    b = p["b"]
    c = p["c"]
    return a * (x - c) ** 2 + b

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    _ensure_fitted()

    # Pick group-specific parameters if available; otherwise global fallback
    params = _PARAMETERS.get(group)
    if params is None:
        params = _PARAMETERS.get(GLOBAL_KEY)
        # As an extra guard, if even global is missing, synthesize a trivial fallback
        if params is None:
            params = {"a": 1e-3, "b": 0.2, "c": 0.0}

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        x = row.get("log_flops")
        xf = _safe_float(x)
        if xf is None:
            # If log_flops is missing or invalid, return NaN to signal unusable input
            outputs.append({"brier_score": float("nan")})
        else:
            yhat = _predict_one(xf, params)
            outputs.append({"brier_score": float(yhat)})
    return outputs

if __name__ == "__main__":
    # Optional CLI to inspect fitted parameters and fit quality per group.
    _ensure_fitted()
    # Pretty print results in a stable order
    groups = sorted(_PARAMETERS.keys())
    # Column header
    print("group\tn\ta\tb\tc\tmse")
    for g in groups:
        p = _PARAMETERS[g]
        s = _FIT_STATS.get(g, {})
        print(
            f"{g}\t{s.get('n', float('nan')):.0f}\t{p['a']:.6g}\t{p['b']:.6g}\t{p['c']:.6g}\t{s.get('mse', float('nan')):.6g}"
        )

#3 Run 3 R² = -1.000000

▼

Python

from __future__ import annotations

from typing import Dict, List, Tuple
import math
import os

# We only import numpy and datasets lazily to keep import-time failures graceful.
# Fitting is attempted at import; if unavailable, we fall back to sane defaults.
_params_by_group: Dict[str, Tuple[float, float, float]] = {}
_default_params: Tuple[float, float, float] = (0.0, 0.0, 0.0)  # a, b, c
_DATA_PATH = "/app/data"
_EXPLAIN_MD = "/app/explain.md"


def _try_imports():
    try:
        import numpy as np  # type: ignore
    except Exception:
        np = None  # type: ignore
    try:
        from datasets import load_from_disk  # type: ignore
    except Exception:
        load_from_disk = None  # type: ignore
    return np, load_from_disk


def _fit_quadratic(xs, ys, l2: float = 1e-8):
    """
    Fit y = a x^2 + b x + c by ridge-regularized least squares.
    Returns (a, b, c)
    """
    import numpy as np  # local import ensured by _try_imports already
    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)
    # Design matrix: [x^2, x, 1]
    X = np.column_stack([x * x, x, np.ones_like(x)])
    # Ridge term
    A = X.T @ X + l2 * np.eye(3)
    b = X.T @ y
    try:
        beta = np.linalg.solve(A, b)
    except np.linalg.LinAlgError:
        beta = np.linalg.pinv(A) @ b
    a, bb, c = map(float, beta)
    # Encourage U-shape if grossly non-U-shaped fit occurs due to noise:
    # If a <= 0, refit a constrained form y = k + m*(x - x0)^2 by searching x0 on a coarse grid.
    if a <= 0:
        x_min, x_max = float(x.min()), float(x.max())
        grid = np.linspace(x_min, x_max, num=max(25, min(200, len(x) * 5)))
        best = None
        for x0 in grid:
            Z = np.column_stack([(x - x0) ** 2, np.ones_like(x)])  # [ (x - x0)^2, 1 ]
            A2 = Z.T @ Z + l2 * np.eye(2)
            b2 = Z.T @ y
            try:
                k_m = np.linalg.solve(A2, b2)
            except np.linalg.LinAlgError:
                k_m = np.linalg.pinv(A2) @ b2
            m, k = float(k_m[0]), float(k_m[1])
            y_hat = m * (x - x0) ** 2 + k
            err = float(np.mean((y_hat - y) ** 2))
            if best is None or err < best[0]:
                best = (err, m, k, x0)
        if best is not None and best[1] > 0:
            # Convert y = m (x - x0)^2 + k back to a,b,c
            m, k, x0 = best[1], best[2], best[3]
            a = m
            bb = -2.0 * m * x0
            c = m * (x0 ** 2) + k
    return float(a), float(bb), float(c)


def _load_and_fit(path: str = _DATA_PATH):
    """
    Loads dataset from disk and fits per-group quadratic parameters.

    Expected columns:
      - log_flops (input, float)
      - brier_score (target, float)
      - group (categorical string), if missing, all data treated as one group.

    Produces:
      - _params_by_group mapping for all groups found
      - _default_params as the 'all' fit across groups
      - Writes /app/explain.md with a summary table (best-effort)
    """
    global _params_by_group, _default_params

    np, load_from_disk = _try_imports()
    if np is None or load_from_disk is None:
        # Unable to fit; keep defaults and write a basic explanation.
        _params_by_group = {}
        _default_params = (0.0, 0.0, 0.0)
        _write_explain_md(_params_by_group, _default_params, fitted=False)
        return

    try:
        ds = load_from_disk(path)
    except Exception:
        # Dataset not found or unreadable
        _params_by_group = {}
        _default_params = (0.0, 0.0, 0.0)
        _write_explain_md(_params_by_group, _default_params, fitted=False)
        return

    # Normalize to a flat list of dict rows
    def _to_rows(dataset_obj):
        # dataset_obj could be a Dataset, DatasetDict, or dict-like of splits
        rows = []
        try:
            # DatasetDict-like
            keys = list(dataset_obj.keys())  # type: ignore
            for k in keys:
                split = dataset_obj[k]
                rows.extend(split.to_dict(batch_size=len(split)) if hasattr(split, "to_dict") else list(split))
        except Exception:
            # Single Dataset-like
            try:
                rows = dataset_obj.to_dict(batch_size=len(dataset_obj))  # type: ignore
                # to_dict returns column-wise; convert to row-wise
                if isinstance(rows, dict):
                    cols = list(rows.keys())
                    n = len(rows[cols[0]]) if cols else 0
                    rows = [{c: rows[c][i] for c in cols} for i in range(n)]
            except Exception:
                # Best-effort iteration
                try:
                    rows = list(dataset_obj)  # type: ignore
                except Exception:
                    rows = []
        return rows

    rows = _to_rows(ds)
    # Try merging multiple splits if ds is a DatasetDict and to_dict per-split failed
    if not rows:
        try:
            # Concatenate splits manually
            all_rows = []
            for k in ds.keys():
                split = ds[k]
                d = split.to_dict(batch_size=len(split))
                cols = list(d.keys())
                n = len(d[cols[0]]) if cols else 0
                all_rows.extend([{c: d[c][i] for c in cols} for i in range(n)])
            rows = all_rows
        except Exception:
            pass

    # Extract columns
    Xs, Ys, Gs = [], [], []
    for r in rows:
        try:
            x = float(r.get("log_flops"))
            y = float(r.get("brier_score"))
            if math.isnan(x) or math.isnan(y):
                continue
        except Exception:
            continue
        g = r.get("group")
        if g is None:
            # attempt alternative names
            g = r.get("group_name") or r.get("dataset") or r.get("family") or "all"
        Gs.append(str(g))
        Xs.append(x)
        Ys.append(y)

    if not Xs:
        _params_by_group = {}
        _default_params = (0.0, 0.0, 0.0)
        _write_explain_md(_params_by_group, _default_params, fitted=False)
        return

    # Global fit
    a, b, c = _fit_quadratic(Xs, Ys)
    _default_params = (a, b, c)

    # Per-group fits (require at least 3 points; otherwise fallback to global)
    _params_by_group = {}
    from collections import defaultdict

    by_group_x: Dict[str, List[float]] = defaultdict(list)
    by_group_y: Dict[str, List[float]] = defaultdict(list)
    for x, y, g in zip(Xs, Ys, Gs):
        by_group_x[g].append(x)
        by_group_y[g].append(y)

    for g, xs in by_group_x.items():
        ys = by_group_y[g]
        if len(xs) >= 3:
            _params_by_group[g] = _fit_quadratic(xs, ys)
        else:
            _params_by_group[g] = _default_params

    # Also keep a global entry for fallback
    _params_by_group["__all__"] = _default_params

    _write_explain_md(_params_by_group, _default_params, fitted=True)


def _write_explain_md(params_by_group: Dict[str, Tuple[float, float, float]],
                      default_params: Tuple[float, float, float],
                      fitted: bool):
    """
    Writes a human-readable explanation and a table of fitted coefficients.
    """
    def fmt(v: float) -> str:
        try:
            return f"{float(v):.6g}"
        except Exception:
            return str(v)

    lines: List[str] = []
    lines.append("# Discovered U-shaped scaling law for Brier score vs. log FLOPs")
    lines.append("")
    lines.append("We model the final language-modeling performance (brier_score) as a U-shaped quadratic function of the training compute measured in log FLOPs:")
    lines.append("")
    lines.append("brier_score = a * (log_flops)^2 + b * (log_flops) + c")
    lines.append("")
    lines.append("This quadratic form captures the U-shape via a > 0. The parabola's vertex (minimum) occurs at:")
    lines.append("")
    lines.append("log_flops* = -b / (2a)")
    lines.append("")
    lines.append("Methodology:")
    lines.append("- Loaded the dataset from /app/data (Hugging Face datasets format).")
    lines.append("- Fitted a regularized least-squares quadratic y = a x^2 + b x + c per group (with a light L2 penalty).")
    lines.append("- If a non-U-shaped fit (a <= 0) occurred, refit a constrained form y = m (x - x0)^2 + k via grid search over x0, then converted back to (a, b, c).")
    lines.append("- Also computed a global ('__all__') fit for fallback when a group's data are insufficient.")
    lines.append("")
    if not fitted or not params_by_group:
        lines.append("Fitting did not run (missing dependencies or dataset not found). Defaults are in effect.")
        lines.append("")
    lines.append("Fitted coefficients by group:")
    lines.append("")
    header = "| group | a | b | c | vertex_x = -b/(2a) | min_y |"
    sep = "|---|---:|---:|---:|---:|---:|"
    lines.append(header)
    lines.append(sep)

    def vertex_and_min(a: float, b: float, c: float):
        if a != 0:
            xv = -b / (2.0 * a)
            yv = a * xv * xv + b * xv + c
            return xv, yv
        return float("nan"), float("nan")

    any_rows = False
    # Sort known groups alphabetically but put __all__ last
    keys = [k for k in params_by_group.keys() if k != "__all__"]
    keys.sort()
    if "__all__" in params_by_group:
        keys.append("__all__")
    for g in keys:
        a, b, c = params_by_group[g]
        xv, yv = vertex_and_min(a, b, c)
        lines.append(f"| {g} | {fmt(a)} | {fmt(b)} | {fmt(c)} | {fmt(xv)} | {fmt(yv)} |")
        any_rows = True

    if not any_rows:
        a, b, c = default_params
        xv, yv = vertex_and_min(a, b, c)
        lines.append(f"| __all__ | {fmt(a)} | {fmt(b)} | {fmt(c)} | {fmt(xv)} | {fmt(yv)} |")

    try:
        os.makedirs(os.path.dirname(_EXPLAIN_MD), exist_ok=True)
        with open(_EXPLAIN_MD, "w", encoding="utf-8") as f:
            f.write("\n".join(lines) + "\n")
    except Exception:
        # Best-effort only; ignore write failures
        pass


# Attempt to fit on import and write explain.md
try:
    _load_and_fit(_DATA_PATH)
except Exception:
    # Keep defaults if anything unexpected happened
    _params_by_group = {}
    _default_params = (0.0, 0.0, 0.0)
    _write_explain_md(_params_by_group, _default_params, fitted=False)


def _predict_for_group(log_flops: float, group: str) -> float:
    """
    Evaluate the quadratic for the given group (falls back to global then defaults).
    """
    a, b, c = _params_by_group.get(group) or _params_by_group.get("__all__") or _default_params
    x = float(log_flops)
    return float(a * x * x + b * x + c)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    outputs: List[dict[str, float]] = []
    for row in input_data:
        if "log_flops" not in row:
            raise KeyError("Each input row must contain 'log_flops'.")
        y_hat = _predict_for_group(row["log_flops"], group)
        outputs.append({"brier_score": float(y_hat)})
    return outputs

#4 Run 4 R² = -1.000000

▼

Python

from __future__ import annotations

from typing import Dict, List
import math
import os
import sys
import json
from dataclasses import dataclass

# Optional dependencies: we guard imports to keep this file robust.
try:
    import numpy as np
except Exception:  # pragma: no cover
    np = None  # We will fall back to simple defaults if numpy is unavailable.

# Try to load the HF dataset at import time to fit parameters per group.
DATA_DIR = "/app/data"


@dataclass
class QuadParams:
    # Quadratic in centered input: y = a + b*(x - mu) + c*(x - mu)^2
    a: float
    b: float
    c: float  # constrained to c > 0 for U-shape
    mu: float

    def predict(self, x: float) -> float:
        xc = x - self.mu
        return self.a + self.b * xc + self.c * xc * xc


def _safe_clip_brier(y: float) -> float:
    # Brier score lies in [0, 1]
    if math.isnan(y) or math.isinf(y):
        return 1.0
    return min(1.0, max(0.0, y))


def _default_params() -> QuadParams:
    # Generic U-shaped prior (mild curvature), centered at mu=0
    # y = 0.2 - 0.02*x + 0.002*x^2, clipped to [0,1]
    return QuadParams(a=0.2, b=-0.02, c=0.002, mu=0.0)


def _fit_group(xs: List[float], ys: List[float]) -> QuadParams:
    # Fit y ~ a + b*(x - mu) + c*(x - mu)^2 with OLS, enforce c > 0
    if np is None or len(xs) == 0 or len(ys) == 0:
        return _default_params()

    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)

    # Numerical stability: center x
    mu = float(np.mean(x))
    xc = x - mu
    X = np.column_stack([np.ones_like(xc), xc, xc**2])

    # Solve least squares
    try:
        beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    except Exception:
        return _default_params()

    a, b, c = map(float, beta)

    # Enforce U-shape: c > 0. If not, refit a,b with c fixed to positive.
    eps = 1e-8
    if not (c > 0.0 and math.isfinite(c)):
        c_pos = max(abs(c), eps)
        # Refit a and b on z = y - c_pos*xc^2
        z = y - c_pos * (xc**2)
        Xab = np.column_stack([np.ones_like(xc), xc])
        try:
            ab, *_ = np.linalg.lstsq(Xab, z, rcond=None)
            a, b = map(float, ab)
            c = c_pos
        except Exception:
            return _default_params()

    # Small regularization to avoid overfitting extreme curvature
    # Cap curvature to a sensible range relative to x scale
    c = float(max(c, eps))

    return QuadParams(a=a, b=b, c=c, mu=mu)


def _load_and_fit() -> Dict[str, QuadParams]:
    coeffs: Dict[str, QuadParams] = {}

    # Try to load dataset
    try:
        from datasets import load_from_disk  # type: ignore
    except Exception:
        # datasets not available; fall back to default/global
        coeffs["__global__"] = _default_params()
        return coeffs

    # Load from disk
    try:
        ds = load_from_disk(DATA_DIR)
    except Exception:
        coeffs["__global__"] = _default_params()
        return coeffs

    # Get a Dataset (if DatasetDict, prefer 'train')
    try:
        # datasets >= 2.x
        if hasattr(ds, "keys"):  # DatasetDict-like
            split = "train" if "train" in ds.keys() else next(iter(ds.keys()))
            dset = ds[split]
        else:
            dset = ds
    except Exception:
        coeffs["__global__"] = _default_params()
        return coeffs

    # Identify columns
    colnames = list(getattr(dset, "column_names", []))
    x_key_candidates = ["log_flops", "log_compute", "log_flop", "x"]
    y_key_candidates = ["brier_score", "brier", "y"]
    group_key_candidates = ["group", "Group", "family", "dataset", "series"]

    def _pick(cands: List[str], cols: List[str]) -> str | None:
        for k in cands:
            if k in cols:
                return k
        return None

    x_key = _pick(x_key_candidates, colnames)
    y_key = _pick(y_key_candidates, colnames)
    g_key = _pick(group_key_candidates, colnames)

    if x_key is None or y_key is None:
        # Cannot fit without these; fallback
        coeffs["__global__"] = _default_params()
        return coeffs

    # Materialize data to python lists to avoid dependency on HF dataset at predict-time
    try:
        records = dset.to_dict()
        xs_all = records[x_key]
        ys_all = records[y_key]
        if g_key is not None:
            gs_all = records[g_key]
        else:
            gs_all = ["__global__"] * len(xs_all)
    except Exception:
        # Fallback to iteration
        xs_all, ys_all, gs_all = [], [], []
        try:
            for row in dset:
                xs_all.append(row.get(x_key))
                ys_all.append(row.get(y_key))
                gs_all.append(row.get(g_key) if g_key in row else "__global__")
        except Exception:
            coeffs["__global__"] = _default_params()
            return coeffs

    # Group data
    groups: Dict[str, Dict[str, List[float]]] = {}
    for x, y, g in zip(xs_all, ys_all, gs_all):
        try:
            xf = float(x)
            yf = float(y)
        except Exception:
            continue
        if not (math.isfinite(xf) and math.isfinite(yf)):
            continue
        groups.setdefault(str(g), {"x": [], "y": []})
        groups[str(g)]["x"].append(xf)
        groups[str(g)]["y"].append(yf)

    # Also fit a global prior
    all_x = [float(v) for v in xs_all if v is not None]
    all_y = [float(v) for v in ys_all if v is not None]
    global_params = _fit_group(all_x, all_y)
    coeffs["__global__"] = global_params

    # Fit per-group, fallback to global if insufficient data
    for g, data in groups.items():
        xs = data["x"]
        ys = data["y"]
        if len(xs) >= 3 and len(ys) >= 3:
            coeffs[g] = _fit_group(xs, ys)
        else:
            coeffs[g] = global_params

    return coeffs


# Fit at import time
_COEFFS: Dict[str, QuadParams] = _load_and_fit()


def _write_explain_md(coeffs: Dict[str, QuadParams]) -> None:
    """
    Write the explanation and fitted parameters to /app/explain.md.
    This runs at import time to reflect the dataset actually present.
    """
    try:
        lines: List[str] = []
        lines.append("# U-shaped scaling law for Brier score vs. log_flops")
        lines.append("")
        lines.append("Formula (per group g):")
        lines.append("  brier_score = a_g + b_g * (log_flops - mu_g) + c_g * (log_flops - mu_g)^2")
        lines.append("where c_g > 0 enforces the U-shape; mu_g is a per-group centering constant.")
        lines.append("")
        lines.append("Methodology:")
        lines.append("- Load /app/data with datasets.load_from_disk().")
        lines.append("- Fit an ordinary least squares quadratic in centered log_flops per group.")
        lines.append("- Enforce convexity (U-shape) by projecting curvature c_g to be strictly positive;")
        lines.append("  if needed, refit a_g and b_g holding c_g fixed.")
        lines.append("- Clip predictions to [0, 1], the valid Brier score range.")
        lines.append("")
        lines.append("Fitted coefficients (one line per group):")
        for g, p in sorted(coeffs.items()):
            if g == "__global__":
                label = "(global prior)"
            else:
                label = ""
            # Vertex (minimum) of the parabola in original x scale
            # Vertex at x* = mu - b/(2c); y* = a - b^2/(4c)
            vx = p.mu - (p.b / (2.0 * p.c))
            vy = p.a - (p.b * p.b) / (4.0 * p.c)
            lines.append(
                f"- group: {g} {label} | a={p.a:.8g}, b={p.b:.8g}, c={p.c:.8g}, mu={p.mu:.8g} | vertex_x={vx:.8g}, vertex_y={vy:.8g}"
            )
        lines.append("")
        with open("/app/explain.md", "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
    except Exception:
        # Best-effort: do not crash if filesystem is read-only
        pass


# Write explanation file at import (best-effort).
_write_explain_md(_COEFFS)


def _get_params_for_group(group: str) -> QuadParams:
    # Use exact match if available, else fall back to global prior
    if group in _COEFFS:
        return _COEFFS[group]
    # Try some forgiving matches
    key_lower = group.lower()
    for k in _COEFFS.keys():
        if k.lower() == key_lower:
            return _COEFFS[k]
    return _COEFFS.get("__global__", _default_params())


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _get_params_for_group(group)
    out: List[Dict[str, float]] = []
    for row in input_data:
        # Accept both "log_flops" key and a fallback "x"
        if "log_flops" in row:
            x = float(row["log_flops"])
        elif "x" in row:
            x = float(row["x"])
        else:
            # If input is missing, predict with x=mu (parabola vertex vicinity)
            x = params.mu
        yhat = params.predict(x)
        out.append({"brier_score": _safe_clip_brier(float(yhat))})
    return out

#5 Run 5 R² = -1.000000

▼

Python

"""
U-shaped scaling law for brier_score as a function of log_flops.

We model the final performance (brier_score) with a convex quadratic in log_flops:
    y = a_g + b_g * (x - c_g)^2
where:
    - x = log_flops
    - y = brier_score
    - g = group identifier
    - b_g >= 0 ensures a U-shaped curve
The functional form is shared across groups; parameters (a_g, b_g, c_g) are fit per group
from the dataset located at /app/data using datasets.load_from_disk().
"""

from __future__ import annotations

from typing import Dict, List, Tuple
import math
import os

import numpy as np

try:
    from datasets import load_from_disk  # type: ignore
except Exception:  # pragma: no cover
    load_from_disk = None  # type: ignore

# Path to on-disk dataset
_DATA_PATH = "/app/data"

# Stored parameters per group: {group: {"a": float, "b": float, "c": float}}
_PARAMS: Dict[str, Dict[str, float]] = {}

# Fallback/global group key (used when a requested group wasn't seen during fitting)
_GLOBAL_KEY = "__GLOBAL__"


def _is_dataset_dict(obj) -> bool:
    # Lightweight check without importing specific classes
    return hasattr(obj, "keys") and hasattr(obj, "__getitem__") and not hasattr(obj, "column_names")


def _load_dataset():
    if load_from_disk is None:
        return None
    if not os.path.exists(_DATA_PATH):
        return None
    try:
        ds = load_from_disk(_DATA_PATH)
        return ds
    except Exception:
        return None


def _coalesce_split(ds):
    # Accept either a Dataset or a DatasetDict. Prefer "train" split if present,
    # otherwise concatenate all splits.
    if ds is None:
        return None
    if _is_dataset_dict(ds):
        # DatasetDict-like
        try:
            if "train" in ds:
                return ds["train"]
            # Concatenate all splits
            splits = [ds[k] for k in ds.keys()]
            if len(splits) == 1:
                return splits[0]
            # Use Dataset's .concatenate if available, else simple reduction
            base = splits[0]
            for nxt in splits[1:]:
                base = base.concatenate(nxt)  # type: ignore[attr-defined]
            return base
        except Exception:
            # Fallback to first available
            keys = list(ds.keys())
            return ds[keys[0]]
    else:
        return ds


def _detect_columns(dataset) -> Tuple[str, str, str | None]:
    """
    Detect (x_key, y_key, group_key) from a Hugging Face Dataset.
    x_key ~ log_flops, y_key ~ brier_score, group_key optional (e.g., 'group').
    """
    cols = set(dataset.column_names)  # type: ignore[attr-defined]
    # x (log_flops)
    if "log_flops" in cols:
        x_key = "log_flops"
    else:
        # heuristic search
        candidates = [c for c in cols if "log" in c.lower() and "flop" in c.lower()]
        x_key = candidates[0] if candidates else next(iter(cols))
    # y (brier_score)
    if "brier_score" in cols:
        y_key = "brier_score"
    else:
        candidates = [c for c in cols if "brier" in c.lower() and "score" in c.lower()]
        y_key = candidates[0] if candidates else next(iter(cols - {x_key}))
    # group (optional)
    group_key = None
    for gk in ("group", "Group", "family", "cluster", "cohort"):
        if gk in cols:
            group_key = gk
            break
    return x_key, y_key, group_key


def _fit_quadratic_vertex(xs: np.ndarray, ys: np.ndarray) -> Tuple[float, float, float]:
    """
    Fit y = a + b*(x - c)^2 via linear regression in standard quadratic form:
      y = A*x^2 + B*x + C, with A > 0 for convexity (U-shape).
    Convert to vertex parameters:
      b = A
      c = -B/(2A)
      a = C - b*c^2
    A small ridge term improves stability. Enforce A >= eps.
    """
    xs = xs.astype(float)
    ys = ys.astype(float)
    X = np.stack([xs**2, xs, np.ones_like(xs)], axis=1)  # columns: [x^2, x, 1]
    lam = 1e-6
    XtX = X.T @ X
    XtX += lam * np.eye(3)
    Xty = X.T @ ys
    try:
        theta = np.linalg.solve(XtX, Xty)
    except np.linalg.LinAlgError:
        theta, *_ = np.linalg.lstsq(X, ys, rcond=None)
    A, B, C = map(float, theta)
    eps = 1e-12
    if not math.isfinite(A) or A <= eps:
        # Project to a minimal convex curvature
        A = max(eps, abs(A) if math.isfinite(A) else eps)
    b = A
    c = -B / (2.0 * A)
    a = C - b * c * c
    return float(a), float(b), float(c)


def _fit_params_per_group():
    ds_all = _load_dataset()
    ds = _coalesce_split(ds_all)
    if ds is None:
        # No dataset available: set a very conservative global default.
        _PARAMS[_GLOBAL_KEY] = {"a": 0.2, "b": 0.01, "c": 0.0}
        return

    x_key, y_key, group_key = _detect_columns(ds)

    # Collect data
    xs_all: List[float] = []
    ys_all: List[float] = []
    by_group: Dict[str, Tuple[List[float], List[float]]] = {}

    for row in ds:  # type: ignore[assignment]
        try:
            x = float(row[x_key])
            y = float(row[y_key])
        except Exception:
            continue
        if not (math.isfinite(x) and math.isfinite(y)):
            continue
        xs_all.append(x)
        ys_all.append(y)
        g = str(row[group_key]) if group_key and (row.get(group_key) is not None) else _GLOBAL_KEY
        by_group.setdefault(g, ([], []))
        by_group[g][0].append(x)
        by_group[g][1].append(y)

    # Always fit a global model as a fallback
    if len(xs_all) >= 3:
        a, b, c = _fit_quadratic_vertex(np.array(xs_all), np.array(ys_all))
    elif len(xs_all) >= 2:
        # With 2 points, b is poorly identified; set minimal curvature
        xs = np.array(xs_all)
        ys = np.array(ys_all)
        x0 = float(xs.mean())
        # Solve for a, given b ~ eps and c ~ x0
        b = 1e-6
        c = x0
        a = float((ys - b * (xs - c) ** 2).mean())
    elif len(xs_all) == 1:
        x0 = xs_all[0]
        y0 = ys_all[0]
        a, b, c = y0, 1e-6, x0
    else:
        a, b, c = 0.2, 0.01, 0.0
    _PARAMS[_GLOBAL_KEY] = {"a": float(a), "b": float(b), "c": float(c)}

    # Fit per-group; for tiny groups, back off to global
    for g, (xs_g, ys_g) in by_group.items():
        xs = np.array(xs_g, dtype=float)
        ys = np.array(ys_g, dtype=float)
        if len(xs) >= 3:
            a_g, b_g, c_g = _fit_quadratic_vertex(xs, ys)
            _PARAMS[g] = {"a": float(a_g), "b": float(b_g), "c": float(c_g)}
        else:
            _PARAMS[g] = _PARAMS[_GLOBAL_KEY].copy()


# Fit once at import time
try:
    _fit_params_per_group()
except Exception:
    # Robust to any runtime issues; retain defaults if present
    if _GLOBAL_KEY not in _PARAMS:
        _PARAMS[_GLOBAL_KEY] = {"a": 0.2, "b": 0.01, "c": 0.0}


def _extract_x(d: Dict[str, float]) -> float:
    if "log_flops" in d:
        return float(d["log_flops"])
    # heuristic fallback
    for k, v in d.items():
        lk = k.lower()
        if "log" in lk and "flop" in lk:
            try:
                return float(v)
            except Exception:
                continue
    # As a last resort, try the first numeric value
    for v in d.values():
        if isinstance(v, (int, float)) and math.isfinite(v):
            return float(v)
    raise KeyError("No suitable 'log_flops' key found in input_data row.")


def _predict_one(x: float, params: Dict[str, float]) -> float:
    a = float(params["a"])
    b = max(0.0, float(params["b"]))  # ensure convexity
    c = float(params["c"])
    y = a + b * (x - c) ** 2
    # Brier score is bounded in [0, 1]; clamp for numerical safety.
    return float(min(1.0, max(0.0, y)))


def get_params() -> Dict[str, Dict[str, float]]:
    """
    Returns a copy of the fitted parameters per group.
    Keys: group names (plus __GLOBAL__), Values: dict(a, b, c).
    """
    return {k: dict(v) for k, v in _PARAMS.items()}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _PARAMS.get(group) or _PARAMS.get(_GLOBAL_KEY)
    if params is None:
        params = {"a": 0.2, "b": 0.01, "c": 0.0}

    preds: List[Dict[str, float]] = []
    for row in input_data:
        x = _extract_x(row)
        y_hat = _predict_one(x, params)
        preds.append({"brier_score": y_hat})
    return preds