MoE Scaling Law

Agent: codex
Model: GPT-5
Best R²: 0.832737
Mean R²: 0.649375
Min R²: -0.007324
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.832737
Python
from __future__ import annotations

import math


# Discovered scaling law (shared functional form across groups):
#   loss = L + K * (P**alpha * E**beta) ** (-gamma)
# where:
#   P = dense_parameter_count (float, > 0)
#   E = num_experts (float, > 0)
# Parameters (L, K, gamma, alpha, beta) are group-specific constants.
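#
# Worked example (illustrative inputs, not taken from the dataset): with the
# "all_data" constants below, P = 1e8 and E = 8 give
#   alpha*ln(P) + beta*ln(E) ≈ 19.998
#   loss ≈ 1.616974 + 43.469602 * exp(-0.190978 * 19.998) ≈ 2.571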


# Fitted parameters per group from the provided dataset.
# Values are rounded to six decimal places for readability.
_PARAMS_BY_GROUP: dict[str, tuple[float, float, float, float, float]] = {
    # group: (L, K, gamma, alpha, beta)
    "all_data": (
        1.616974,  # L
        43.469602, # K
        0.190978,  # gamma
        1.041879,  # alpha
        0.387373,  # beta
    ),
}


def _predict_loss(P: float, E: float, params: tuple[float, float, float, float, float]) -> float:
    L, K, gamma, alpha, beta = params
    # Guard against non-positive inputs; fall back to returning L if invalid.
    if P <= 0 or E <= 0:
        return float(L)
    # Compute effective scale and apply the power-law decay.
    # Use logs for numerical stability: (P**alpha * E**beta)**(-gamma) = exp(-gamma * (alpha*ln P + beta*ln E))
    s_log = alpha * math.log(P) + beta * math.log(E)
    decay = math.exp(-gamma * s_log)
    return float(L + K * decay)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
               The functional form of the law must be the same for all groups,
               but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Select parameters for the provided group. If unseen, fall back to a reasonable default.
    # Default: use the parameters fitted on the aggregate group if available, otherwise a safe baseline.
    if group in _PARAMS_BY_GROUP:
        params = _PARAMS_BY_GROUP[group]
    elif "all_data" in _PARAMS_BY_GROUP:
        params = _PARAMS_BY_GROUP["all_data"]
    else:
        # Conservative fallback (keeps loss near a plausible constant if no params are known)
        params = (2.0, 1.0, 0.2, 1.0, 0.5)

    outputs: list[dict[str, float]] = []
    for row in input_data:
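        # Missing keys default to 0.0, which trips the non-positive guard in
        # _predict_loss and yields the asymptote L.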
        P = float(row.get("dense_parameter_count", 0.0))
        E = float(row.get("num_experts", 0.0))
        pred = _predict_loss(P, E, params)
        outputs.append({"loss_validation": pred})

    return outputs
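
A minimal usage sketch of law() (the rows below are hypothetical
configurations, not rows from the original dataset):

Python
rows = [
    {"dense_parameter_count": 1e8, "num_experts": 8.0},
    {"dense_parameter_count": 5e8, "num_experts": 32.0},
]
print(law(rows, "all_data"))
# With the "all_data" constants this prints loss_validation values of
# roughly 2.571 and 2.242.

A sketch of how constants like these could be refit on new data, assuming
scipy is available (the source does not show the agent's actual fitting
procedure, and the data below are synthetic placeholders). Note that the
form is over-parameterized: only the products gamma*alpha and gamma*beta are
identifiable, so the sketch fits the reduced form
loss = L + K * exp(-(a*ln(P) + b*ln(E))) with a = gamma*alpha and
b = gamma*beta.

Python
import numpy as np
from scipy.optimize import curve_fit


def reduced_form(X, L, K, a, b):
    P, E = X
    return L + K * np.exp(-(a * np.log(P) + b * np.log(E)))


# Synthetic placeholder observations (not the original dataset).
P_obs = np.array([1e7, 1e7, 1e8, 1e8, 1e9, 1e9])
E_obs = np.array([1.0, 8.0, 1.0, 8.0, 1.0, 8.0])
rng = np.random.default_rng(0)
loss_obs = reduced_form((P_obs, E_obs), 1.617, 43.470, 0.199, 0.074)
loss_obs = loss_obs + rng.normal(0.0, 0.01, size=loss_obs.shape)

# An initial guess near the reported constants helps convergence.
popt, _ = curve_fit(reduced_form, (P_obs, E_obs), loss_obs,
                    p0=(1.6, 40.0, 0.2, 0.07), maxfev=20000)
L_fit, K_fit, a_fit, b_fit = popt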

Other runs (code not shown):

#2 Run 2 R² = 0.832695
#3 Run 3 R² = 0.808867
#4 Run 4 R² = 0.779898
#5 Run 5 R² = -0.007324