SFT Scaling Law

Agent: aider
Model: GPT-5
Best R²: 0.892971
Mean R²: 0.131226
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.892971 (Best)
Python
from __future__ import annotations

import math
from typing import Dict, Iterable, List, Tuple

# Try to import datasets; fall back gracefully if unavailable.
try:
    from datasets import load_from_disk, Dataset, DatasetDict  # type: ignore
except Exception:  # pragma: no cover
    load_from_disk = None  # type: ignore
    Dataset = object  # type: ignore
    DatasetDict = dict  # type: ignore


# Parameters: group -> (L_inf, A, alpha)
_PARAMS_BY_GROUP: Dict[str, Tuple[float, float, float]] = {}
_GLOBAL_PARAMS: Tuple[float, float, float] = (0.0, 1.0, 0.5)  # sensible default fallback


def _linear_fit(x: List[float], y: List[float]) -> Tuple[float, float]:
    """
    Simple unweighted least squares fit for y = m*x + b
    Returns (m, b)
    """
    n = len(x)
    if n == 0:
        return (0.0, 0.0)
    mean_x = sum(x) / n
    mean_y = sum(y) / n
    sxx = sum((xi - mean_x) ** 2 for xi in x)
    if sxx <= 0.0:
        return (0.0, mean_y)
    sxy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    m = sxy / sxx
    b = mean_y - m * mean_x
    return (m, b)
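
# Worked example (hand-checkable, not part of the original run): for the
# exactly-linear points (0, 1), (1, 3), (2, 5) the closed form gives
# slope m = 2 and intercept b = 1, so
#     _linear_fit([0.0, 1.0, 2.0], [1.0, 3.0, 5.0]) == (2.0, 1.0)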


def _fit_power_law_with_asymptote(xs: List[float], ys: List[float]) -> Tuple[float, float, float]:
    """
    Fit the three-parameter scaling law:
        loss(N) = L_inf + A * N^(-alpha)
    via a coarse grid search over L_inf and linear regression in log space for A and alpha.

    Returns (L_inf, A, alpha)
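
    Why this works: for a fixed L_inf the model log-linearizes exactly,
        log(loss(N) - L_inf) = log(A) - alpha * log(N),
    so A and alpha fall out of a single least-squares fit per candidate
    L_inf, leaving only L_inf for the outer one-dimensional grid search.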
    """
    # Sanitize and filter data
    data = [(float(x), float(y)) for x, y in zip(xs, ys) if x is not None and y is not None]
    data = [(x, y) for x, y in data if x > 0 and math.isfinite(x) and math.isfinite(y)]
    if not data:
        return (0.0, 1.0, 0.5)

    xs = [x for x, _ in data]
    ys = [y for _, y in data]
    y_min = min(ys)
    y_max = max(ys)

    # If no variation, fall back to a simpler 2-parameter power law with L_inf=0
    if not math.isfinite(y_min) or not math.isfinite(y_max) or abs(y_max - y_min) < 1e-12:
        # Fit y = A * N^(-alpha) in log space
        t = [math.log(x) for x in xs]
        z = [math.log(max(y, 1e-12)) for y in ys]
        m, b = _linear_fit(t, z)
        alpha = -m
        A = math.exp(b)
        if not (math.isfinite(alpha) and alpha > 0 and math.isfinite(A) and A > 0):
            alpha, A = 0.5, max(y_min, 1e-6)
        return (0.0, A, alpha)

    # Define a grid for L_inf below the minimum observed loss
    span = max(y_max - y_min, 1e-6)
    upper = y_min - 1e-9  # must be strictly below min(y)
    lower = max(0.0, y_min - 0.25 * span)
    if lower >= upper:
        lower = max(0.0, 0.5 * upper)

    candidates: List[float] = []
    steps = 50
    for i in range(steps):
        frac = (i + 0.5) / steps
        L = lower + frac * (upper - lower)
        if L < upper:
            candidates.append(L)
    # Also try L_inf = 0 explicitly
    if 0.0 < upper:
        candidates.append(0.0)

    best_err = float("inf")
    best_params = (0.0, 1.0, 0.5)

    t_vals = [math.log(x) for x in xs]

    for L in candidates:
        # Compute transformed targets z = log(y - L)
        # Safe because L < min(y) by construction
        z_vals = [math.log(y - L) for y in ys]
        m, b = _linear_fit(t_vals, z_vals)
        alpha = -m
        A = math.exp(b)

        # Discard invalid fits
        if not (math.isfinite(alpha) and alpha > 0 and math.isfinite(A) and A > 0 and math.isfinite(L) and L >= 0):
            continue

        # Evaluate SSE in natural space
        err = 0.0
        for x, y in zip(xs, ys):
            y_hat = L + A * (x ** (-alpha))
            if not math.isfinite(y_hat):
                err = float("inf")
                break
            diff = y_hat - y
            err += diff * diff

        if err < best_err:
            best_err = err
            best_params = (L, A, alpha)

    return best_params
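
# Sanity sketch (synthetic points, an illustrative check rather than part of
# the original submission): data drawn exactly from loss(N) = 0.5 + 2*N^(-0.3)
# should be recovered up to the coarseness of the L_inf grid:
# >>> ns = [1e2, 1e3, 1e4, 1e5]
# >>> ys = [0.5 + 2.0 * n ** (-0.3) for n in ns]
# >>> _fit_power_law_with_asymptote(ns, ys)  # approximately (0.5, 2.0, 0.3)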


def _load_all_records(path: str = "/app/data") -> List[dict]:
    """
    Load all rows from a HuggingFace dataset or dataset dict located at path.
    Returns a list of Python dict records.
    """
    records: List[dict] = []
    if load_from_disk is None:
        return records
    try:
        ds = load_from_disk(path)  # type: ignore
    except Exception:
        return records

    def _iter_rows(d) -> Iterable[dict]:
        try:
            return iter(d)  # HuggingFace Datasets are iterable
        except Exception:
            return iter([])

    # DatasetDict: combine splits
    try:
        if isinstance(ds, DatasetDict):  # type: ignore
            for split_name in ds.keys():  # type: ignore
                split_ds = ds[split_name]  # type: ignore
                for row in _iter_rows(split_ds):
                    records.append(row)
        elif isinstance(ds, Dataset):  # type: ignore
            for row in _iter_rows(ds):
                records.append(row)
        else:
            # Fallback: try dict-like
            if hasattr(ds, "values"):
                for part in ds.values():  # type: ignore
                    for row in _iter_rows(part):
                        records.append(row)
    except Exception:
        # As a last resort, attempt to iterate ds directly
        try:
            for row in _iter_rows(ds):
                records.append(row)
        except Exception:
            pass

    return records
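
# Assumed record schema (inferred from the column accesses below, not from
# benchmark documentation): each row carries a numeric "sft_data_size" (N),
# a numeric "sft_loss" (the regression target), and optionally a group
# column named "group", "sft_group", "family", or "model_group".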


def _fit_all_groups() -> None:
    """
    Fit parameters per group and globally, storing them in module-level caches.
    Also writes/updates /app/explain.md with the discovered parameters if possible.
    """
    global _PARAMS_BY_GROUP, _GLOBAL_PARAMS

    records = _load_all_records("/app/data")
    # Extract columns robustly
    def get_val(rec: dict, key: str, default=None):
        return rec.get(key, default)

    # Determine group field
    group_field_candidates = ["group", "sft_group", "family", "model_group"]
    group_field = None
    if records:
        sample = records[0]
        for k in group_field_candidates:
            if k in sample:
                group_field = k
                break
    if group_field is None:
        group_field = "group"  # default name; treat all as one group

    # Partition data by group
    by_group: Dict[str, Tuple[List[float], List[float]]] = {}
    xs_all: List[float] = []
    ys_all: List[float] = []

    for rec in records:
        x = get_val(rec, "sft_data_size")
        y = get_val(rec, "sft_loss")
        g = get_val(rec, group_field, "default")
        try:
            xf = float(x)
            yf = float(y)
        except Exception:
            continue
        if not (math.isfinite(xf) and math.isfinite(yf) and xf > 0):
            continue

        xs_all.append(xf)
        ys_all.append(yf)
        if g not in by_group:
            by_group[g] = ([], [])
        by_group[g][0].append(xf)
        by_group[g][1].append(yf)

    # Global fit (pooled)
    if xs_all and ys_all:
        _GLOBAL_PARAMS = _fit_power_law_with_asymptote(xs_all, ys_all)
    # Otherwise keep the default fallback parameters already in _GLOBAL_PARAMS.

    # Per-group fit
    params_by_group: Dict[str, Tuple[float, float, float]] = {}
    if by_group:
        for g, (xs, ys) in by_group.items():
            params_by_group[g] = _fit_power_law_with_asymptote(xs, ys)
    else:
        # No groups available; use a single default group
        params_by_group["default"] = _GLOBAL_PARAMS

    _PARAMS_BY_GROUP = params_by_group

    # Attempt to write an explain file with discovered parameters
    try:
        lines: List[str] = []
        lines.append("# SFT Scaling Law\n")
        lines.append("We model the supervised fine-tuning loss as a function of the number of fine-tuning examples N using a three-parameter power law with an asymptote:\n")
        lines.append("L(N) = L_inf + A * N^(-alpha)\n")
        lines.append("\nMethodology:\n")
        lines.append("- For each group, we sweep a grid of candidate L_inf values below the minimum observed loss.\n")
        lines.append("- For each candidate L_inf, we fit log(L - L_inf) = log A - alpha * log N via linear least squares to estimate A and alpha.\n")
        lines.append("- We pick the parameters (L_inf, A, alpha) that minimize squared error in the original loss space.\n")
        lines.append("\nFitted parameters by group:\n")
        for g, (L_inf, A, alpha) in sorted(_PARAMS_BY_GROUP.items(), key=lambda kv: str(kv[0])):
            lines.append(f"- {g}: L_inf={L_inf:.6g}, A={A:.6g}, alpha={alpha:.6g}\n")
        lines.append("\nGlobal pooled fit (used as fallback for unknown groups):\n")
        L_inf, A, alpha = _GLOBAL_PARAMS
        lines.append(f"- GLOBAL: L_inf={L_inf:.6g}, A={A:.6g}, alpha={alpha:.6g}\n")

        with open("/app/explain.md", "w", encoding="utf-8") as f:
            # Entries already end in "\n"; join on "" to avoid doubled blank lines.
            f.write("".join(lines))
    except Exception:
        # Non-fatal if we cannot write the explanation.
        pass


# Fit once at import time (best-effort; safe no-op if dataset unavailable)
_fit_all_groups()


def _params_for_group(group: str) -> Tuple[float, float, float]:
    if group in _PARAMS_BY_GROUP:
        return _PARAMS_BY_GROUP[group]
    # Try case-insensitive key match
    group_lower = group.lower()
    for g in _PARAMS_BY_GROUP.keys():
        if str(g).lower() == group_lower:
            return _PARAMS_BY_GROUP[g]
    return _GLOBAL_PARAMS


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Ensure parameters are available (import-time fit may have been skipped in some environments)
    if not _PARAMS_BY_GROUP:
        _fit_all_groups()

    L_inf, A, alpha = _params_for_group(group)

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        try:
            n = float(row.get("sft_data_size", 0.0))
        except (TypeError, ValueError):
            n = 0.0
        if not (math.isfinite(n) and n > 0):
            # Graceful handling for invalid N: predict at N=1, the smallest
            # meaningful dataset size.
            n = 1.0
        y_hat = L_inf + A * (n ** (-alpha))
        # Safety: ensure finite
        if not math.isfinite(y_hat):
            y_hat = float("nan")
        outputs.append({"sft_loss": float(y_hat)})
    return outputs
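
# Minimal usage sketch ("my-group" is a hypothetical group name; unknown
# groups fall back to the global pooled fit):
# >>> law([{"sft_data_size": 1e3}, {"sft_data_size": 1e5}], "my-group")
# [{'sft_loss': ...}, {'sft_loss': ...}]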
#2 Run 2 R² = 0.887761
#3 Run 3 R² = 0.875399
#4 Run 4 R² = -1.000000
#5 Run 5 R² = -1.000000