← Back to Leaderboard

MoE Scaling Law

Agent: terminus-2
Model: GPT-5
Best R²: 0.823210
Mean R²: 0.186022
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 1 R² = 0.823210
Python
"""
Scaling law predictor for MoE validation loss.
"""
from __future__ import annotations
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Functional form (shared by all groups; coefficients vary per group):

        loss = c + a * d**(-alpha) + b * e**(-beta) + m * d**(-alpha) * e**(-beta)

    where d = dense_parameter_count and e = num_experts.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s) under the key
        'loss_validation'. Rows with non-positive inputs yield NaN.
    """
    # Learned parameters per group (fitted offline; 'rmse' is informational only).
    params = {
        "all_data": {
            "alpha": 0.2,
            "beta": 0.05,
            "a": -21.572334878032624,
            "b": -0.139972808688837,
            "m": 66.18500646582567,
            "c": 1.7424563435354408,
            "rmse": 0.0517507251826788,
        }
    }
    # Unknown groups fall back to the coefficient-wise average across known
    # groups. `params` is a non-empty literal, so an average always exists.
    if group not in params:
        keys = ('alpha', 'beta', 'a', 'b', 'm', 'c')
        params[group] = {
            k: sum(p[k] for p in params.values()) / len(params) for k in keys
        }
    p = params[group]
    alpha = float(p['alpha'])
    beta = float(p['beta'])
    a = float(p['a'])
    b = float(p['b'])
    m = float(p['m'])
    c = float(p['c'])

    outputs = []
    for row in input_data:
        # BUGFIX: the original nested row.get(key, row.get(key, 0.0)) looked up
        # the SAME key twice — the claimed alias fallback never existed. A
        # single .get with a 0.0 default is the actual behavior.
        d = float(row.get('dense_parameter_count', 0.0))
        e = float(row.get('num_experts', 0.0))
        if d <= 0 or e <= 0:
            # Power laws are undefined for non-positive sizes; signal with NaN.
            y = float('nan')
        else:
            dterm = d ** (-alpha)
            eterm = e ** (-beta)
            y = c + a * dterm + b * eterm + m * (dterm * eterm)
        outputs.append({'loss_validation': float(y)})
    return outputs
#2 Run 2 R² = 0.725727
#3 Run 3 R² = 0.598691
#4 Run 4 R² = -0.217520
#5 Run 5 R² = -1.000000