SLD - MoE Scaling Law - goose + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.832695

▼

Python

from __future__ import annotations
from typing import Dict, List
import math

# Discovered scaling law (shared functional form across groups):
#   loss_validation = L0[group] + C[group] * num_experts**(-p[group]) * dense_parameter_count**(-q[group])
# Coefficients were fit via nonlinear least squares on the provided dataset.
# See /app/explain.md for methodology and diagnostics.

# Name of the group whose coefficients are used whenever a requested group has
# no entry of its own in _PARAMS.
_FALLBACK_GROUP = "all_data"

# Fitted coefficients (L0, C, p, q) of the scaling law, keyed by group name.
# The fitted dataset contained a single group, which doubles as the fallback.
_PARAMS: Dict[str, Dict[str, float]] = {
    _FALLBACK_GROUP: dict(
        L0=1.6170181290494012,
        C=43.47571056885845,
        p=0.073982766969121,
        q=0.19898568380705728,
    ),
}


def _get_params(group: str) -> Dict[str, float]:
    """Return the coefficient dict for ``group``, or the fallback group's if absent."""
    # Resolve the fallback up front so a missing fallback entry fails loudly
    # regardless of which group was requested.
    default = _PARAMS[_FALLBACK_GROUP]
    try:
        return _PARAMS[group]
    except KeyError:
        return default


def _predict_one(x: Dict[str, float], params: Dict[str, float]) -> float:
    """Evaluate the scaling law for a single data point.

    Computes ``L0 + C * E**(-p) * D**(-q)`` where ``E`` is read from
    ``x["num_experts"]`` and ``D`` from ``x["dense_parameter_count"]``.

    Args:
        x: One data point. A missing key is read as 0.0.
        params: Coefficients under the keys "L0", "C", "p" and "q".

    Returns:
        The predicted value as a float.
    """
    # Anything below this floor (zero, negatives, missing keys) is raised to
    # it so the negative powers below stay defined. No error is reported for
    # such inputs; the result is simply a very large number.
    floor = 1e-12
    experts = max(float(x.get("num_experts", 0.0)), floor)
    dense = max(float(x.get("dense_parameter_count", 0.0)), floor)

    offset, scale = params["L0"], params["C"]
    expert_exp, dense_exp = params["p"], params["q"]

    expert_term = experts ** (-expert_exp)
    dense_term = dense ** (-dense_exp)
    return float(offset + scale * expert_term * dense_term)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the validation loss for each data point using the fitted scaling law.

    Args:
        input_data: Data points, one dictionary per point, mapping input
                    variable names to their values.
        group: Name of the experimental group. The functional form is shared
                by every group; only the coefficients are group-specific, and
                they are looked up through ``_get_params``.

    Returns:
        A list aligned with ``input_data``; each element is a dictionary
        holding the prediction under the key "loss_validation".
    """
    coeffs = _get_params(group)
    return [{"loss_validation": _predict_one(point, coeffs)} for point in input_data]

#2 Run 2 R² = 0.832695

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict Mixture-of-Experts validation loss from model-size inputs.

    Every group shares one functional form:
        loss_validation = L_inf[group] + k[group] * D^{-a[group]} * E^{-b[group]}
    with
        D = dense_parameter_count (dense, non-expert parameters)
        E = num_experts

    Only the constants (L_inf, k, a, b) vary between groups.

    Args:
        input_data: Data points, one dictionary per point. The values are read
                    from the keys 'dense_parameter_count' and 'num_experts';
                    a missing key is read as 0.0.
        group: Name of the experimental group. A name without a fitted entry
                uses the 'all_data' coefficients.

    Returns:
        A list aligned with input_data; each element is a dictionary with the
        prediction under 'loss_validation'. A row whose D or E is <= 0
        (which includes rows with a missing key) yields NaN instead of
        raising, so the output keeps the input's shape.
    """
    # Coefficients fitted offline, keyed by group.
    # The original author reports R^2 ≈ 0.9584 for this fit on the provided
    # dataset; that figure cannot be checked from this file.
    fitted = {
        "all_data": {
            "L_inf": 1.6170182350192674,
            "k": 43.475725375568594,
            "a": 0.19898570773332047,
            "b": 0.07398277477016527,
        },
    }

    chosen = fitted.get(group, fitted["all_data"])
    loss_floor = float(chosen["L_inf"])   # asymptotic loss floor
    scale = float(chosen["k"])            # multiplicative scale
    dense_exp = float(chosen["a"])        # exponent applied to D
    expert_exp = float(chosen["b"])       # exponent applied to E

    def predict(point: dict[str, float]) -> float:
        dense = float(point.get("dense_parameter_count", 0.0))
        experts = float(point.get("num_experts", 0.0))
        # Non-positive inputs have no valid power-law value; signal with NaN.
        invalid = dense <= 0.0 or experts <= 0.0
        if invalid:
            return float("nan")
        return float(loss_floor + scale * (dense ** (-dense_exp)) * (experts ** (-expert_exp)))

    return [{"loss_validation": predict(point)} for point in input_data]

#3 Run 3 R² = 0.832694

▼

Python

from __future__ import annotations
from typing import List, Dict

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict validation loss for each data point from a fitted scaling law.

    Shared functional form:
        loss_validation = L_inf + C * P^(-p) * E^(-q)
    with P = dense_parameter_count and E = num_experts.

    Coefficients are stored per group; a group without its own entry uses the
    coefficients stored under "all_data".

    Args:
        input_data: Data points, one dictionary per point. Each must contain
                    the keys 'dense_parameter_count' and 'num_experts'.
        group: Name of the experimental group to predict for.

    Returns:
        A list aligned with input_data; each element is a dictionary with the
        prediction under "loss_validation".

    Raises:
        KeyError: If a row lacks 'dense_parameter_count' or 'num_experts'.
    """
    required_keys = ("dense_parameter_count", "num_experts")

    table: Dict[str, Dict[str, float]] = {
        "all_data": {
            "L_inf": 1.61701941,
            "C": 43.47587177,
            "p": 0.19898595,
            "q": 0.07398288,
        },
    }
    selected = table.get(group, table["all_data"])
    loss_floor = float(selected["L_inf"])   # asymptotic loss
    amplitude = float(selected["C"])        # size of the finite-capacity correction
    dense_exp = float(selected["p"])        # exponent applied to P
    expert_exp = float(selected["q"])       # exponent applied to E

    results: List[Dict[str, float]] = []
    for point in input_data:
        if any(key not in point for key in required_keys):
            raise KeyError("Each input row must contain 'dense_parameter_count' and 'num_experts'.")

        dense_raw = point["dense_parameter_count"]
        experts_raw = point["num_experts"]
        # An explicit None is read as 0.0, which lands in the non-positive branch.
        dense = 0.0 if dense_raw is None else float(dense_raw)
        experts = 0.0 if experts_raw is None else float(experts_raw)

        # NOTE(review): non-positive inputs return the bare floor L_inf, i.e.
        # the lowest loss this law can produce, rather than an error or NaN.
        if dense <= 0.0 or experts <= 0.0:
            estimate = loss_floor
        else:
            estimate = loss_floor + amplitude * (dense ** (-dense_exp)) * (experts ** (-expert_exp))
        results.append({"loss_validation": float(estimate)})

    return results

#4 Run 4 R² = 0.823211

▼

Python

from __future__ import annotations


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Functional form (shared across groups):
        loss_validation = c0
                         + c1 * D**(-alpha)
                         + c2 * E**(-beta)
                         + c3 * (D**(-alpha) * E**(-beta))
        where D = dense_parameter_count, E = num_experts.

    Parameters are group-specific. For unseen groups, fall back to "all_data".

    Args:
        input_data: A list of dictionaries, one per data point, each holding
                    'dense_parameter_count' and 'num_experts'.
        group: Name of the experimental group to predict for.

    Returns:
        A list aligned with input_data; each element is a dictionary with the
        prediction under "loss_validation". Inputs below 1e-12 (zero or
        negative values) are raised to 1e-12 before exponentiation instead of
        being rejected.

    Raises:
        TypeError: If input_data is not a list, or one of its elements is not
            a dict.
        KeyError: If a row lacks a required key; the original lookup error is
            attached as the cause.
    """
    if not isinstance(input_data, list):
        raise TypeError("input_data must be a list of dicts")

    # Parameters fitted on the provided dataset (group "all_data").
    params_by_group: dict[str, dict[str, float]] = {
        "all_data": {
            "alpha": 0.20,
            "beta": 0.05,
            "c0": 1.74245634,
            "c1": -21.57233488,
            "c2": -0.13997281,
            "c3": 66.18500647,
        }
    }

    params = params_by_group.get(group, params_by_group["all_data"])

    # The coefficients do not depend on the row, so read them once.
    alpha = params["alpha"]
    beta = params["beta"]
    c0 = params["c0"]
    c1 = params["c1"]
    c2 = params["c2"]
    c3 = params["c3"]

    outputs: list[dict[str, float]] = []
    for row in input_data:
        if not isinstance(row, dict):
            raise TypeError("Each element of input_data must be a dict")
        try:
            D_raw = float(row["dense_parameter_count"])  # dense (non-expert) parameters
            E_raw = float(row["num_experts"])            # number of experts
        except KeyError as e:
            # Chain explicitly so the traceback shows which lookup failed.
            raise KeyError(f"Missing required key in input_data row: {e}") from e

        # Numerical safety (handle zeros/negatives defensively)
        D = max(D_raw, 1e-12)
        E = max(E_raw, 1e-12)

        Dm = D ** (-alpha)
        Em = E ** (-beta)
        y = c0 + c1 * Dm + c2 * Em + c3 * (Dm * Em)

        outputs.append({"loss_validation": float(y)})

    return outputs

#5 Run 5 R² = 0.743222

▼

Python

from __future__ import annotations
import math
from typing import List, Dict


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss for each data point from a fitted scaling law.

    Shared functional form:
        loss_validation = a + b * D**p + c * ln(E) + d * (D**p) * ln(E)
    with D = dense_parameter_count and E = num_experts.

    Args:
        input_data: Data points, one dictionary per point. D is read from
                    'dense_parameter_count' (or, failing that, from 'D') and
                    E from 'num_experts' (or, failing that, from 'E'). If
                    neither key is present the value is read as 0.0.
        group: Name of the experimental group. A name without a fitted entry
                uses the "all_data" coefficients.

    Returns:
        A list aligned with input_data; each element is a dictionary with the
        prediction under "loss_validation". Any D or E that is not strictly
        positive is replaced by 1e-8 before use; no error is raised.
    """
    # Substitute used for inputs that are not strictly positive, so that
    # log() and the negative power stay defined.
    tiny = 1e-8

    # Per-group coefficients. The original author notes that p was found by
    # grid search and a, b, c, d by OLS with p held fixed.
    fitted: Dict[str, Dict[str, float]] = {
        "all_data": {
            "a": 1.7049166669399852,
            "b": 56.51400046820522,
            "c": -0.0011799949980899783,
            "d": -3.5934294058295933,
            "p": -0.21875,  # -7/32
        }
    }

    chosen = fitted.get(group, fitted["all_data"])
    intercept = float(chosen["a"])
    dense_coef = float(chosen["b"])      # multiplies D**p
    log_coef = float(chosen["c"])        # multiplies ln(E)
    cross_coef = float(chosen["d"])      # multiplies (D**p) * ln(E)
    dense_exp = float(chosen["p"])       # exponent on D

    def predict(point: Dict[str, float]) -> float:
        # Canonical key first, short alias second, 0.0 if both are absent.
        dense = float(point.get("dense_parameter_count", point.get("D", 0.0)))
        experts = float(point.get("num_experts", point.get("E", 0.0)))

        log_experts = math.log(experts if experts > 0.0 else tiny)
        dense_pow = (dense if dense > 0.0 else tiny) ** dense_exp

        return float(
            intercept
            + dense_coef * dense_pow
            + log_coef * log_experts
            + cross_coef * dense_pow * log_experts
        )

    return [{"loss_validation": predict(point)} for point in input_data]