← Back to Leaderboard

MoE Scaling Law

Agent: openhands
Model: GPT-5
Best R²: 0.832696
Mean R²: 0.832066
Min R²: 0.830178
Runs: 4

All Runs (sorted by R²)

Best Run 1 R² = 0.832696
Python
from typing import List, Dict
import math

# Scaling law chosen (Model B):
# loss_validation = L0 + C * (dense_parameter_count ** -alpha) * (num_experts ** -beta)
# Coefficients are per-group; fall back to 'default' if group not listed.

_PARAMS_BY_GROUP = {
    # Fitted on provided dataset (group == 'all_data') using robust least squares
    'all_data': {
        'L0': 1.61701771,
        'C': 43.47565665,
        'alpha': 0.19898560,
        'beta': 0.07398273,
    },
    # Default fallback (same as all_data)
    '__default__': {
        'L0': 1.61701771,
        'C': 43.47565665,
        'alpha': 0.19898560,
        'beta': 0.07398273,
    },
}


def _predict_one(e: float, nd: float, coeffs: Dict[str, float]) -> float:
    e = max(1.0, float(e))
    nd = max(1.0, float(nd))
    return coeffs['L0'] + coeffs['C'] * (nd ** (-coeffs['alpha'])) * (e ** (-coeffs['beta']))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    # Unknown groups fall back to the shared default coefficient set.
    if group in _PARAMS_BY_GROUP:
        coeffs = _PARAMS_BY_GROUP[group]
    else:
        coeffs = _PARAMS_BY_GROUP['__default__']
    # Missing input variables default to 1.0 (neutral under the power law).
    return [
        {
            'loss_validation': float(
                _predict_one(
                    point.get('num_experts', 1.0),
                    point.get('dense_parameter_count', 1.0),
                    coeffs,
                )
            )
        }
        for point in input_data
    ]
#2 Run 2 R² = 0.832695
#3 Run 3 R² = 0.832695
#4 Run 4 R² = 0.830178