SLD - MoE Scaling Law - opencode + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.832695

▼

Python

from __future__ import annotations
from typing import Dict, List

# Fitted constants, keyed by experimental group, for the multiplicative power law
#   loss_validation = L_inf + K * (dense_parameter_count)**(-alpha) * (num_experts)**(-beta)
# The only group present is 'all_data', fitted on the provided dataset.

# Group whose coefficients are used when the requested group has no entry.
_DEFAULT_GROUP = "all_data"

_COEFFS: Dict[str, Dict[str, float]] = {
    "all_data": dict(
        L_inf=1.6170181475797127,   # additive constant of the law
        K=43.475711011953884,       # scale of the power-law term
        alpha=0.19898568476505754,  # exponent applied to dense_parameter_count
        beta=0.07398277097857449,   # exponent applied to num_experts
    ),
}


def _predict_loss(dense_params: float, num_experts: float, p: Dict[str, float]) -> float:
    """Evaluate ``L_inf + K * D**(-alpha) * E**(-beta)`` for a single data point.

    Args:
        dense_params: Dense parameter count (D); converted with ``float``.
        num_experts: Number of experts (E); converted with ``float``.
        p: Coefficient mapping with keys ``"L_inf"``, ``"K"``, ``"alpha"``
           and ``"beta"``.

    Returns:
        The predicted loss as a float.
    """
    # Inputs below 1 are raised to 1 so the power terms stay well defined;
    # valid data is not expected to hit this floor.
    dense = max(float(dense_params), 1.0)
    experts = max(float(num_experts), 1.0)

    dense_factor = dense ** (-p["alpha"])
    expert_factor = experts ** (-p["beta"])
    return p["L_inf"] + p["K"] * dense_factor * expert_factor


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss for every data point using the fitted scaling law.

    Args:
        input_data: A list of dictionaries, one per data point. The keys read
                    are 'dense_parameter_count' and 'num_experts'; a missing
                    key is treated as 0.0 before being passed to the predictor.
        group: Name of the experimental group whose coefficients to use. A
                group without an entry in the coefficient table falls back to
                the default group's coefficients.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        of the form {'loss_validation': <float>}.
    """
    if group in _COEFFS:
        coeffs = _COEFFS[group]
    else:
        coeffs = _COEFFS[_DEFAULT_GROUP]

    return [
        {
            "loss_validation": float(
                _predict_loss(
                    float(point.get("dense_parameter_count", 0.0)),
                    float(point.get("num_experts", 0.0)),
                    coeffs,
                )
            )
        }
        for point in input_data
    ]

#2 Run 2 R² = 0.829767

▼

Python

from typing import List, Dict


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict validation loss from dense parameter count and number of experts.

    The law is an asymptotic multiplicative power law:

        loss_validation = A + B * (dense_parameter_count ** b) * (num_experts ** c)

    The constants (A, B, b, c) are stored per experimental group. A group that
    has no entry in the table is evaluated with the "all_data" constants.

    Args:
        input_data: A list of dictionaries, one per data point. The keys read
                    are:
                        - 'num_experts'
                        - 'dense_parameter_count'
                    A missing key is treated as 0.0.
        group: The name of the experimental group for which to make
                predictions. The functional form is shared by all groups;
                only the constants differ.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        with the single key 'loss_validation'. The value is NaN when either
        input is zero or negative.
    """
    # group -> (A, B, b, c), fitted from the dataset under /app/data.
    # 'all_data' was fitted on 193 points (RMSE ~ 0.05255 on training data).
    fitted = {
        "all_data": (
            1.6057342355089967,
            41.28866734867788,
            -0.19547840587237095,
            -0.07426497458304816,
        ),
    }
    offset, scale, dense_exp, expert_exp = fitted.get(group, fitted["all_data"])

    def predict(point: Dict[str, float]) -> float:
        dense = float(point.get("dense_parameter_count", 0.0))
        experts = float(point.get("num_experts", 0.0))
        # The fitted law is only meaningful for strictly positive inputs.
        if dense <= 0 or experts <= 0:
            return float("nan")
        return offset + scale * (dense ** dense_exp) * (experts ** expert_exp)

    return [{"loss_validation": predict(point)} for point in input_data]

#3 Run 3 R² = -0.121429

▼

Python

from __future__ import annotations
from typing import Dict, List
import math

# Log-linear law with an interaction term (the form is shared across groups):
#   loss = a + b*log10(P) + c*log10(E) + d*log10(P)*log10(E)
# where P = dense_parameter_count and E = num_experts.
# Coefficients may vary per `group`; they were fitted on the provided dataset
# (group = 'all_data').

# Coefficients used when an unknown group is requested. This is the same
# dictionary object that is registered under 'all_data' below.
DEFAULT_COEFFS: Dict[str, float] = dict(
    a=7.02938669,
    b=-0.53603608,
    c=-0.80004463,
    d=0.08089269,
)

# Learned from the visible dataset.
COEFFICIENTS: Dict[str, Dict[str, float]] = {'all_data': DEFAULT_COEFFS}


def _predict_one(P: float, E: float, coeffs: Dict[str, float]) -> float:
    """Evaluate ``a + b*log10(P) + c*log10(E) + d*log10(P)*log10(E)``.

    Args:
        P: Dense parameter count.
        E: Number of experts.
        coeffs: Mapping with keys 'a', 'b', 'c' and 'd'.

    Returns:
        The predicted loss, or positive infinity when either input is zero
        or negative (the logarithms are undefined there).
    """
    if P <= 0 or E <= 0:
        # Invalid inputs get a conservative, maximally bad loss.
        return float('inf')

    log_p, log_e = math.log10(P), math.log10(E)
    intercept = coeffs['a']
    dense_part = coeffs['b'] * log_p
    expert_part = coeffs['c'] * log_e
    interaction = coeffs['d'] * log_p * log_e
    return intercept + dense_part + expert_part + interaction


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict validation loss for every data point using the log-linear law.

    Args:
        input_data: A list of dictionaries, one per data point. The keys read
                    are 'dense_parameter_count' and 'num_experts'; a missing
                    key is treated as 0.0.
        group: Name of the experimental group whose coefficients to use. A
                group without an entry in the coefficient table is evaluated
                with the default coefficients.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        of the form {'loss_validation': <float>}.
    """
    if group in COEFFICIENTS:
        active = COEFFICIENTS[group]
    else:
        active = DEFAULT_COEFFS

    return [
        {
            'loss_validation': float(
                _predict_one(
                    float(point.get('dense_parameter_count', 0.0)),
                    float(point.get('num_experts', 0.0)),
                    active,
                )
            )
        }
        for point in input_data
    ]

#4 Run 4 R² = -0.121429

▼

Python

from typing import List, Dict
import math

# Model form (log-linear with an interaction term):
#   loss_validation = a + b * log10(dense_parameter_count)
#                           + c * log10(num_experts)
#                           + d * log10(dense_parameter_count) * log10(num_experts)
# The coefficients were fitted on the provided dataset (group: 'all_data') and
# selected after model comparison via 5-fold CV.

# Group whose coefficients are used when the requested group has no entry.
_DEFAULT_GROUP = "all_data"

_COEFFS = {
    "all_data": dict(
        a=7.0293866862586984,    # intercept
        b=-0.5360360776769779,   # weight of log10(dense_parameter_count)
        c=-0.8000446261370273,   # weight of log10(num_experts)
        d=0.08089268558264515,   # weight of the product of the two logs
    )
}


def _predict_row(row: Dict[str, float], coeffs: Dict[str, float]) -> Dict[str, float]:
    """Predict the validation loss for one data point.

    Args:
        row: Data point; must contain 'dense_parameter_count' and
             'num_experts' (a missing key raises KeyError).
        coeffs: Mapping with keys 'a', 'b', 'c' and 'd'.

    Returns:
        A dictionary of the form {'loss_validation': <float>}.
    """
    dense = float(row["dense_parameter_count"])
    experts = float(row["num_experts"])

    if dense <= 0 or experts <= 0:
        # At least one input is outside the strictly positive domain of the
        # fitted data; floor both at a tiny positive value so log10 is defined.
        dense, experts = max(dense, 1e-12), max(experts, 1e-12)

    log_dense = math.log10(dense)
    log_experts = math.log10(experts)
    prediction = (
        coeffs["a"]
        + coeffs["b"] * log_dense
        + coeffs["c"] * log_experts
        + coeffs["d"] * log_dense * log_experts
    )
    return {"loss_validation": float(prediction)}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict validation loss for every data point using the log-linear law.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
                    provide the keys 'num_experts' and
                    'dense_parameter_count'.
        group: Name of the experimental group whose coefficients to use. A
               group without an entry in the coefficient table is evaluated
               with the default group's coefficients.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        holding the predicted 'loss_validation'.
    """
    if group in _COEFFS:
        active = _COEFFS[group]
    else:
        active = _COEFFS[_DEFAULT_GROUP]

    predictions: List[Dict[str, float]] = []
    for point in input_data:
        predictions.append(_predict_row(point, active))
    return predictions

#5 Run 5 R² = -0.220339

▼

Python

from __future__ import annotations
from typing import Dict, List

# Additive two-term power law (the form is the same for all groups):
#   loss_validation = L_inf + A * (dense_parameter_count / 1e8) ** (-alpha)
#                           + B * (num_experts) ** (-beta)
# Group-specific coefficients below were fit on the provided dataset.

# Group to fall back on if an unknown group is requested.
_DEFAULT_GROUP = "all_data"

_COEFFS: Dict[str, Dict[str, float]] = {
    "all_data": dict(
        L_inf=-5.4053617,   # additive constant of the law
        A=0.71103196,       # scale of the dense-parameter term
        alpha=0.24,         # exponent of the dense-parameter term
        B=7.42708552,       # scale of the expert-count term
        beta=0.01,          # exponent of the expert-count term
    ),
}


def _predict_one(num_experts: float, dense_parameter_count: float, group: str) -> float:
    """Evaluate the additive power law for a single data point.

    Computes ``L_inf + A * (dense / 1e8)**(-alpha) + B * experts**(-beta)``
    with the coefficients stored for ``group``.

    Args:
        num_experts: Number of experts; converted with ``float``.
        dense_parameter_count: Dense parameter count; converted with ``float``.
        group: Experimental group name. A group without stored coefficients
               is evaluated with the default group's coefficients.

    Returns:
        The predicted loss as a float.
    """
    coeffs = _COEFFS[group if group in _COEFFS else _DEFAULT_GROUP]

    # Floor both inputs at a tiny positive value so the negative powers
    # below never see zero or a negative base.
    experts = max(float(num_experts), 1e-12)
    dense_scaled = max(float(dense_parameter_count), 1e-12) / 1e8

    dense_term = coeffs["A"] * (dense_scaled ** (-coeffs["alpha"]))
    expert_term = coeffs["B"] * (experts ** (-coeffs["beta"]))
    return float(coeffs["L_inf"] + dense_term + expert_term)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict validation loss for every data point using the additive power law.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
                    contain the keys 'num_experts' and
                    'dense_parameter_count'.
        group: Name of the experimental group whose coefficients to use; it
                is passed through to the per-point predictor.

    Returns:
        A list aligned with input_data, where each element is a dictionary
        of the form {'loss_validation': <float>}.

    Raises:
        ValueError: If a data point lacks 'num_experts' or
            'dense_parameter_count'.
    """
    required_keys = ("num_experts", "dense_parameter_count")
    predictions: List[Dict[str, float]] = []

    for point in input_data:
        # Reject incomplete rows up front rather than predicting from defaults.
        if any(key not in point for key in required_keys):
            raise ValueError(
                "Each input row must contain 'num_experts' and 'dense_parameter_count' keys"
            )
        predicted = _predict_one(
            point["num_experts"], point["dense_parameter_count"], group
        )
        predictions.append({"loss_validation": predicted})

    return predictions