"""
Scaling law predictor for MoE architectures.
This module exposes a single function:
law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]
It implements the discovered functional form:
loss_validation ≈ L_inf[group]
+ A[group] * dense_parameter_count^(-alpha[group])
+ B[group] * num_experts^(-beta[group])
Per-group coefficients are loaded from /app/params.json. If that file does not
exist, the module will fit parameters from the dataset at /app/data, persist
them to /app/params.json, and also write a detailed explanation to /app/explain.md.
You can force (re)fitting and regenerating /app/explain.md by running:
python /app/law.py
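Example (the group name and numeric values below are illustrative only):
    law([{"num_experts": 8, "dense_parameter_count": 1.2e8}], group="baseline")
    # -> [{"loss_validation": <predicted float>}]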
"""
from __future__ import annotations
import json
import math
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Tuple
import numpy as np
PARAMS_PATH = "/app/params.json"
DATASET_PATH = "/app/data"
EXPLAIN_PATH = "/app/explain.md"
@dataclass
class Coeffs:
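    """Per-group coefficients of the additive power law:
    loss = L_inf + A * dense_parameter_count^(-alpha) + B * num_experts^(-beta)
    """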
L_inf: float
A: float
alpha: float
B: float
beta: float
def predict(self, num_experts: float, dense_parameter_count: float) -> float:
# Numerical safety: clamp to tiny positive to avoid 0**negative.
e = max(float(num_experts), 1e-12)
p = max(float(dense_parameter_count), 1e-12)
return float(self.L_inf + self.A * p ** (-self.alpha) + self.B * e ** (-self.beta))
def _load_params() -> Dict[str, Coeffs] | None:
if not os.path.exists(PARAMS_PATH):
return None
with open(PARAMS_PATH, "r", encoding="utf-8") as f:
raw = json.load(f)
coeffs: Dict[str, Coeffs] = {}
for g, c in raw.get("coefficients", {}).items():
coeffs[g] = Coeffs(
L_inf=float(c["L_inf"]),
A=float(c["A"]),
alpha=float(c["alpha"]),
B=float(c["B"]),
beta=float(c["beta"]),
)
return coeffs
def _save_params_and_explain(
coeffs: Dict[str, Coeffs],
fit_meta: Dict[str, Dict[str, float]],
columns: Dict[str, str],
) -> None:
# Save params.json
payload = {
"formula": "loss = L_inf + A * dense_parameter_count^(-alpha) + B * num_experts^(-beta)",
"fitted_on": datetime.utcnow().isoformat() + "Z",
"dataset_path": DATASET_PATH,
"columns": columns,
"coefficients": {
g: {
"L_inf": c.L_inf,
"A": c.A,
"alpha": c.alpha,
"B": c.B,
"beta": c.beta,
**({"r2": fit_meta[g]["r2"], "mse": fit_meta[g]["mse"]} if g in fit_meta else {}),
}
for g, c in coeffs.items()
},
}
with open(PARAMS_PATH, "w", encoding="utf-8") as f:
json.dump(payload, f, indent=2, sort_keys=True)
# Write explain.md with details and fitted parameters
lines: List[str] = []
lines.append("# Discovered Scaling Law for MoE Validation Loss")
lines.append("")
lines.append("This document describes the fitted scaling law relating Mixture-of-Experts (MoE) architecture choices to the final validation loss.")
lines.append("")
lines.append("## Functional form")
lines.append("")
lines.append("We model the validation loss as an additive, saturating power-law in the dense (non-expert) parameters and the number of experts:")
lines.append("")
lines.append("loss_validation = L_inf + A * dense_parameter_count^(-alpha) + B * num_experts^(-beta)")
lines.append("")
lines.append("- L_inf: irreducible loss floor (as capacity → ∞).")
lines.append("- A, alpha: magnitude and decay exponent for dense parameters.")
lines.append("- B, beta: magnitude and decay exponent for the number of experts.")
lines.append("")
lines.append("Rationale: In MoE models, increasing experts primarily expands conditional capacity, while dense parameters govern shared representation power. Empirically, both exhibit diminishing returns well-captured by power-laws; the additive form separates their contributions around a floor L_inf.")
lines.append("")
lines.append("## Fitting methodology")
lines.append("")
lines.append("For each experimental group, we:")
lines.append("- Performed a grid search over exponents alpha and beta on [0.05, 2.0].")
lines.append("- For each (alpha, beta), solved a linear least-squares fit for (L_inf, A, B) in y ≈ c + A * P^(-alpha) + B * E^(-beta).")
lines.append("- Selected the (alpha, beta) that minimized MSE, and reported the corresponding (L_inf, A, B).")
lines.append("")
lines.append("This approach avoids non-convex optimization while capturing the main curvature via exponents.")
lines.append("")
lines.append("## Fitted parameters by group")
lines.append("")
lines.append("| Group | L_inf | A | alpha | B | beta | R^2 | MSE |")
lines.append("|---|---:|---:|---:|---:|---:|---:|---:|")
for g, c in coeffs.items():
r2 = fit_meta.get(g, {}).get("r2", float("nan"))
mse = fit_meta.get(g, {}).get("mse", float("nan"))
lines.append(
f"| {g} | {c.L_inf:.6g} | {c.A:.6g} | {c.alpha:.6g} | {c.B:.6g} | {c.beta:.6g} | {r2:.4f} | {mse:.6g} |"
)
lines.append("")
lines.append("## Column mapping")
lines.append("")
lines.append(f"- num_experts: `{columns.get('num_experts')}`")
lines.append(f"- dense_parameter_count: `{columns.get('dense_parameter_count')}`")
lines.append(f"- loss_validation: `{columns.get('loss_validation')}`")
lines.append(f"- group: `{columns.get('group')}`")
lines.append("")
lines.append("To reproduce or update these values, run:")
lines.append("")
lines.append("```bash")
lines.append("python /app/law.py")
lines.append("```")
with open(EXPLAIN_PATH, "w", encoding="utf-8") as f:
f.write("\n".join(lines))
def _choose_split(ds) -> Tuple[object, str]:
# HuggingFace datasets: handle DatasetDict or Dataset
try:
from datasets import DatasetDict # type: ignore
except Exception:
DatasetDict = None # type: ignore
if DatasetDict is not None and isinstance(ds, DatasetDict):
if "train" in ds:
return ds["train"], "train"
# Fallback to the first available split
first_key = next(iter(ds.keys()))
return ds[first_key], first_key
return ds, "dataset"
def _detect_columns(cols: List[str]) -> Dict[str, str]:
# Try a set of likely column names for each required variable.
def pick(cands: List[str]) -> str | None:
for c in cands:
if c in cols:
return c
return None
experts = pick(["num_experts", "experts", "n_experts", "moe_num_experts"])
dense = pick(
[
"dense_parameter_count",
"dense_params",
"dense_parameters",
"non_expert_parameters",
"dense_parameter_count_total",
]
)
loss = pick(["loss_validation", "val_loss", "validation_loss", "valid_loss"])
group = pick(["group", "group_name", "group_id", "dataset_group", "task"])
missing = [name for name, v in [("num_experts", experts), ("dense_parameter_count", dense), ("loss_validation", loss)] if v is None]
if missing:
raise RuntimeError(f"Required columns not found in dataset: {', '.join(missing)}. Available: {cols}")
return {
"num_experts": experts, # type: ignore[arg-type]
"dense_parameter_count": dense, # type: ignore[arg-type]
"loss_validation": loss, # type: ignore[arg-type]
"group": group or "__ALL__",
}
def _to_numpy_column(dataset, key: str) -> np.ndarray:
    # Convert a single dataset column to a float numpy array.
    return np.asarray(dataset[key], dtype=float)
def _fit_group(E: np.ndarray, P: np.ndarray, y: np.ndarray) -> Tuple[Coeffs, float, float]:
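    """Fit (L_inf, A, alpha, B, beta) for one group.

    The exponents enter nonlinearly, so they are grid-searched; for each
    candidate (alpha, beta) the model is linear in (L_inf, A, B), i.e.
    y ≈ F @ theta with F = [1, P^(-alpha), E^(-beta)], and theta is
    recovered in closed form by least squares.
    """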
# Grid over exponents, solve linear least squares for (c, A, B) for each.
alphas = np.linspace(0.05, 2.0, 40)
betas = np.linspace(0.05, 2.0, 40)
best_loss = math.inf
best_params = None # type: ignore
    # Features are the raw power terms P^(-alpha) and E^(-beta); no log transform is used.
    for alpha in alphas:
        # fP depends only on alpha, so it is computed once per outer iteration.
        fP = np.power(np.clip(P, 1e-12, None), -alpha)
        for beta in betas:
            fE = np.power(np.clip(E, 1e-12, None), -beta)
# Design matrix with intercept
F = np.stack([np.ones_like(fP), fP, fE], axis=1)
# Solve least squares: y ≈ F @ theta, theta = [c, A, B]
theta, residuals, rank, s = np.linalg.lstsq(F, y, rcond=None)
yhat = F @ theta
mse = float(np.mean((y - yhat) ** 2))
if mse < best_loss:
best_loss = mse
best_params = (theta[0], theta[1], float(alpha), theta[2], float(beta))
assert best_params is not None
c, A, alpha, B, beta = [float(v) for v in best_params]
# Compute R^2
yhat = (c + A * np.power(np.clip(P, 1e-12, None), -alpha) + B * np.power(np.clip(E, 1e-12, None), -beta))
ss_res = float(np.sum((y - yhat) ** 2))
ss_tot = float(np.sum((y - np.mean(y)) ** 2))
r2 = 1.0 - (ss_res / ss_tot if ss_tot > 0 else 0.0)
mse = float(np.mean((y - yhat) ** 2))
return Coeffs(L_inf=c, A=A, alpha=alpha, B=B, beta=beta), r2, mse
def _fit_from_dataset() -> Tuple[Dict[str, Coeffs], Dict[str, Dict[str, float]], Dict[str, str]]:
try:
from datasets import load_from_disk # type: ignore
except Exception as e:
raise RuntimeError(
"The 'datasets' package is required to fit parameters. "
"Install it via: pip install datasets"
) from e
ds = load_from_disk(DATASET_PATH)
d, split = _choose_split(ds)
cols = list(d.column_names)
mapping = _detect_columns(cols)
# Extract columns
E_all = _to_numpy_column(d, mapping["num_experts"])
P_all = _to_numpy_column(d, mapping["dense_parameter_count"])
y_all = _to_numpy_column(d, mapping["loss_validation"])
# Groups
group_key = mapping["group"]
if group_key == "__ALL__":
groups = {"__ALL__": np.arange(len(y_all))}
else:
        group_raw = d[group_key]
# Map unique group names to indices
uniq = {}
for i, g in enumerate(group_raw):
uniq.setdefault(str(g), []).append(i)
groups = {k: np.asarray(v, dtype=int) for k, v in uniq.items()}
coeffs: Dict[str, Coeffs] = {}
meta: Dict[str, Dict[str, float]] = {}
for g, idx in groups.items():
E = E_all[idx]
P = P_all[idx]
y = y_all[idx]
c, r2, mse = _fit_group(E, P, y)
coeffs[g] = c
meta[g] = {"r2": float(r2), "mse": float(mse)}
return coeffs, meta, mapping
def _fit_and_save() -> Dict[str, Coeffs]:
coeffs, meta, mapping = _fit_from_dataset()
_save_params_and_explain(coeffs, meta, mapping)
return coeffs
def _fallback_coeffs(coeffs: Dict[str, Coeffs]) -> Coeffs:
# Median of parameters across groups as a robust fallback.
arr = np.array([[c.L_inf, c.A, c.alpha, c.B, c.beta] for c in coeffs.values()], dtype=float)
if arr.size == 0:
# Reasonable generic defaults (will be poor but defined)
return Coeffs(L_inf=0.5, A=1.0, alpha=0.5, B=1.0, beta=0.5)
med = np.median(arr, axis=0)
return Coeffs(L_inf=float(med[0]), A=float(med[1]), alpha=float(med[2]), B=float(med[3]), beta=float(med[4]))
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
"""
Predicts output variables based on input variables according to a discovered scaling law.
Args:
input_data: A list of dictionaries, where each dictionary is a single data
point containing:
- 'num_experts': float
- 'dense_parameter_count': float
group: The name of the experimental group for which to make predictions.
The functional form is the same for all groups; coefficients differ.
Returns:
A list of dictionaries with a single key:
- 'loss_validation': predicted validation loss (float)
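    Example (hypothetical group name and values; groups missing from
    params.json fall back to median coefficients across fitted groups):
        law(
            [
                {"num_experts": 16, "dense_parameter_count": 3.5e8},
                {"num_experts": 64, "dense_parameter_count": 3.5e8},
            ],
            group="expert_sweep",
        )
        # -> [{"loss_validation": ...}, {"loss_validation": ...}]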
"""
coeffs = _load_params()
if coeffs is None:
# Fit from dataset and persist params + explanation
coeffs = _fit_and_save()
# Choose coefficients for the requested group or a robust fallback
c = coeffs.get(group)
if c is None:
c = _fallback_coeffs(coeffs)
out: List[Dict[str, float]] = []
for row in input_data:
ne = float(row.get("num_experts", 0.0))
dp = float(row.get("dense_parameter_count", 0.0))
out.append({"loss_validation": c.predict(ne, dp)})
return out
if __name__ == "__main__":
# (Re)fit parameters from /app/data and regenerate /app/explain.md
coeffs = _fit_and_save()
print(f"Fitted coefficients for {len(coeffs)} group(s) and wrote:")
print(f" - {PARAMS_PATH}")
print(f" - {EXPLAIN_PATH}")