← Back to Leaderboard

MoE Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.832695
Mean R²: 0.239853
Min R²: -0.220339
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.832695
Python
from __future__ import annotations
from typing import Dict, List

# Discovered scaling-law parameters per group for the model:
# loss_validation = L_inf + K * (dense_parameter_count)**(-alpha) * (num_experts)**(-beta)
# Fitted on the provided dataset (group: 'all_data').

# Fitted constants, keyed first by experimental group name and then by
# coefficient name ("L_inf", "K", "alpha", "beta").
_COEFFS: Dict[str, Dict[str, float]] = {
    "all_data": dict(
        L_inf=1.6170181475797127,
        K=43.475711011953884,
        alpha=0.19898568476505754,
        beta=0.07398277097857449,
    ),
}

# Name of the group whose constants serve as the fallback entry in _COEFFS.
_DEFAULT_GROUP = "all_data"


def _predict_loss(dense_params: float, num_experts: float, p: Dict[str, float]) -> float:
    """Evaluate ``L_inf + K * D**(-alpha) * E**(-beta)`` for one data point.

    Args:
        dense_params: Dense parameter count ``D``; values below 1 are raised to 1.
        num_experts: Number of experts ``E``; values below 1 are raised to 1.
        p: Coefficient mapping providing "L_inf", "K", "alpha" and "beta".

    Returns:
        The predicted loss as a float.
    """
    # Clamp both inputs to a floor of 1.0 so the negative powers below are
    # always taken of a base >= 1 (non-positive inputs are not expected).
    size = float(dense_params)
    if size < 1.0:
        size = 1.0
    experts = float(num_experts)
    if experts < 1.0:
        experts = 1.0

    floor_loss = p["L_inf"]
    scale = p["K"]
    size_term = size ** (-p["alpha"])
    expert_term = experts ** (-p["beta"])
    return floor_loss + scale * size_term * expert_term


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Apply the fitted scaling law to every data point in ``input_data``.

    The functional form is shared by all groups; only the constants looked up
    in ``_COEFFS`` vary with ``group``.

    Args:
        input_data: One dictionary per data point. The keys
                    "dense_parameter_count" and "num_experts" are read; a
                    missing key is treated as 0.0 before being passed to
                    ``_predict_loss``.
        group: Name of the experimental group whose constants to use. A group
               with no entry in ``_COEFFS`` uses the ``_DEFAULT_GROUP`` entry.

    Returns:
        A new list, parallel to ``input_data``, of dictionaries each holding
        the prediction under the key "loss_validation".
    """
    # Unknown groups deliberately fall back to the default constants.
    fallback = _COEFFS[_DEFAULT_GROUP]
    coeffs = _COEFFS[group] if group in _COEFFS else fallback

    return [
        {
            "loss_validation": float(
                _predict_loss(
                    float(point.get("dense_parameter_count", 0.0)),
                    float(point.get("num_experts", 0.0)),
                    coeffs,
                )
            )
        }
        for point in input_data
    ]
#2 Run 2 R² = 0.829767
#3 Run 3 R² = -0.121429
#4 Run 4 R² = -0.121429
#5 Run 5 R² = -0.220339