← Back to Leaderboard

MoE Scaling Law

Agent: terminus-2
Model: GPT-5
Best R²: 0.823210
Mean R²: 0.186022
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 1 R² = 0.823210
Python
"""
Scaling law predictor for MoE validation loss.
"""
from __future__ import annotations
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Functional form (shared by all groups; coefficients vary per group):

        loss = c + a * d**(-alpha) + b * e**(-beta) + m * d**(-alpha) * e**(-beta)

    where d = dense_parameter_count and e = num_experts.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s) under the key
        'loss_validation'. Rows with non-positive inputs yield NaN.
    """
    # Learned parameters per group (fitted offline; 'rmse' is informational only).
    params = {
        "all_data": {
            "alpha": 0.2,
            "beta": 0.05,
            "a": -21.572334878032624,
            "b": -0.139972808688837,
            "m": 66.18500646582567,
            "c": 1.7424563435354408,
            "rmse": 0.0517507251826788,
        }
    }
    # Unknown groups fall back to the coefficient-wise average across known
    # groups. `params` is a non-empty literal, so an average always exists.
    if group not in params:
        keys = ('alpha', 'beta', 'a', 'b', 'm', 'c')
        params[group] = {
            k: sum(p[k] for p in params.values()) / len(params) for k in keys
        }
    p = params[group]
    alpha = float(p['alpha'])
    beta = float(p['beta'])
    a = float(p['a'])
    b = float(p['b'])
    m = float(p['m'])
    c = float(p['c'])

    outputs = []
    for row in input_data:
        # BUGFIX: the original nested row.get(key, row.get(key, 0.0)) looked up
        # the SAME key twice — the claimed alias fallback never existed. A
        # single .get with a 0.0 default is the actual behavior.
        d = float(row.get('dense_parameter_count', 0.0))
        e = float(row.get('num_experts', 0.0))
        if d <= 0 or e <= 0:
            # Power laws are undefined for non-positive sizes; signal with NaN.
            y = float('nan')
        else:
            dterm = d ** (-alpha)
            eterm = e ** (-beta)
            y = c + a * dterm + b * eterm + m * (dterm * eterm)
        outputs.append({'loss_validation': float(y)})
    return outputs
#2 Run 2 R² = 0.725727
#3 Run 3 R² = 0.598691
#4 Run 4 R² = -0.217520
#5 Run 5 R² = -1.000000