SLD - MoE Scaling Law - mini-swe-agent + GPT-5

Best Run 1 R² = 0.808867

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss from a multiplicative power law.

    The fitted form is
        loss_validation = L_inf + k * dense_parameter_count**(-a) * num_experts**(-b)
    where k is stored as its natural logarithm under the key 'ln_k'.

    Args:
        input_data: One dictionary per data point. The keys
                    'dense_parameter_count' and 'num_experts' are read; a
                    missing key is treated as 0.0.
        group: Name of the experimental group whose coefficients to use.
               Names without an entry fall back to the '__default__' entry.

    Returns:
        A list parallel to input_data; each element has the single key
        'loss_validation'. The prediction is NaN when either input is zero
        or negative.
    """
    from math import exp

    fitted = {
        "all_data": {
            "L_inf": 1.089119235508997,
            "ln_k": 2.783892522808186,
            "a": 0.1238983859205747,
            "b": 0.047610349087410624,
        },
        "__default__": {
            "L_inf": 1.089119235508997,
            "ln_k": 2.783892522808186,
            "a": 0.1238983859205747,
            "b": 0.047610349087410624,
        },
    }

    # Resolution order: the exact group, then '__default__', then whichever
    # entry happens to come first in the table.
    if group in fitted:
        chosen = fitted[group]
    elif '__default__' in fitted:
        chosen = fitted['__default__']
    else:
        chosen = fitted[next(iter(fitted))]

    asymptote = float(chosen['L_inf'])
    log_scale = float(chosen['ln_k'])
    dense_exp = float(chosen['a'])
    expert_exp = float(chosen['b'])
    scale = float(exp(log_scale))

    def predict(row):
        dense = float(row.get('dense_parameter_count', 0.0))
        experts = float(row.get('num_experts', 0.0))
        # A power law with negative exponents is undefined at or below zero.
        if dense <= 0 or experts <= 0:
            return float('nan')
        return asymptote + scale * (dense ** (-dense_exp)) * (experts ** (-expert_exp))

    return [{'loss_validation': float(predict(row))} for row in input_data]

#2 Run 2 R² = 0.467622

▼

#3 Run 3 R² = 0.363070

▼

Python

# Auto-generated scaling law for MoE validation loss
# Formula: loss_validation = L + a * dp^(-p) + b * ne^(-q) + c * (dp*ne)^(-r)
# where dp = dense_parameter_count, ne = num_experts
from __future__ import annotations
from typing import List, Dict

# Fitted constants for the additive power law, keyed by group name. Both
# groups carry identical values but each gets its own dict object. "GLOBAL"
# is the entry law() falls back to when a group name has no entry.
# NOTE(review): "n" and "rmse" look like fit diagnostics (sample count and
# residual error) -- the prediction code in this script does not read them.
PARAMS = {
    group_name: {
        "L": 1.834176305371023,
        "a": 35.997827556214766,
        "b": -0.20801752620041014,
        "c": 67.47418074483771,
        "n": 193,
        "p": 0.25,
        "q": 1.0,
        "r": 0.25,
        "rmse": 0.05406499663123175,
    }
    for group_name in ("GLOBAL", "all_data")
}

def _predict_one(dp: float, ne: float, pars: dict) -> float:
    """Evaluate L + a*dp^(-p) + b*ne^(-q) + c*(dp*ne)^(-r) for one data point.

    Args:
        dp: Dense parameter count.
        ne: Number of experts.
        pars: Mapping that provides the constants "L", "a", "b", "c", "p",
              "q" and "r".

    Returns:
        The predicted value, or NaN when dp or ne is zero or negative (in
        which case pars is not consulted at all).
    """
    if dp <= 0 or ne <= 0:
        return float('nan')
    offset, dense_coef, expert_coef, joint_coef, dense_exp, expert_exp, joint_exp = (
        pars[key] for key in ("L", "a", "b", "c", "p", "q", "r")
    )
    dense_term = dense_coef * (dp ** (-dense_exp))
    expert_term = expert_coef * (ne ** (-expert_exp))
    joint_term = joint_coef * ((dp * ne) ** (-joint_exp))
    return offset + dense_term + expert_term + joint_term

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss for each data point with the additive power law.

    Args:
        input_data: One dictionary per data point. The keys
                    "dense_parameter_count" and "num_experts" are read; a
                    missing key is passed on to the predictor as NaN.
        group: Name of the experimental group whose constants to use. A name
               with no entry in PARAMS uses the "GLOBAL" entry instead.

    Returns:
        A list parallel to input_data; each element has the single key
        "loss_validation" holding the prediction as a float.
    """
    fallback = PARAMS.get("GLOBAL")
    constants = PARAMS.get(group, fallback)
    missing = float('nan')
    return [
        {
            "loss_validation": float(
                _predict_one(
                    float(row.get("dense_parameter_count", missing)),
                    float(row.get("num_experts", missing)),
                    constants,
                )
            )
        }
        for row in input_data
    ]

#4 Run 4 R² = 0.338276

▼

Python

# Auto-generated scaling law implementation
# Model form: log-quadratic
# Features: 1, logE, logD, logE2, logD2, logE_logD

from math import log10

# Fitted weights for the log-quadratic model, keyed by group name. Both
# groups carry identical values but each gets its own dict and weight list.
# "_default_" is the entry law() uses when a group name has no entry.
# NOTE(review): "r2", "bic" and "n" look like fit diagnostics -- the
# prediction code in this script only reads "weights".
_COEFFICIENTS = {
    group_name: {
        "weights": [
            13.849928930727911,
            -0.7685396744020193,
            -2.266750584688975,
            -0.005468530952045237,
            0.10924879564098447,
            0.0786451523043135,
        ],
        "r2": 0.9613252957444444,
        "bic": -1120.0506497593492,
        "n": 193,
    }
    for group_name in ("all_data", "_default_")
}

# Feature labels, listed in the same order as each "weights" list.
_FEATURES = [
    "1",
    "logE",
    "logD",
    "logE2",
    "logD2",
    "logE_logD",
]

def _predict_one(num_experts: float, dense_parameter_count: float, weights: list[float]) -> float:
    """Evaluate the log-quadratic model for one data point.

    With le = log10(num_experts) and ld = log10(dense_parameter_count), the
    result is the dot product of weights with
    [1, le, ld, le^2, ld^2, le*ld].

    Args:
        num_experts: Number of experts; values below 1e-12 are raised to 1e-12.
        dense_parameter_count: Dense parameter count; clamped the same way.
        weights: Coefficients in the feature order shown above. Pairing stops
                 at the shorter of weights and the six features.

    Returns:
        The weighted sum of the features.
    """
    floor = 1e-12  # keeps log10 defined for zero and negative inputs
    experts = max(float(num_experts), floor)
    dense = max(float(dense_parameter_count), floor)
    log_e = log10(experts)
    log_d = log10(dense)

    basis = (1.0, log_e, log_d, log_e * log_e, log_d * log_d, log_e * log_d)
    return sum(coef * term for coef, term in zip(weights, basis))

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss for each data point with the log-quadratic model.

    Args:
        input_data: One dictionary per data point. The keys "num_experts" and
                    "dense_parameter_count" are read; a missing key is
                    treated as 0.0.
        group: Name of the experimental group whose weights to use. A name
               with no entry in _COEFFICIENTS uses the "_default_" entry.

    Returns:
        A list parallel to input_data; each element has the single key
        "loss_validation" holding the prediction as a float.
    """
    entry = _COEFFICIENTS.get(group)
    entry = _COEFFICIENTS.get("_default_") if entry is None else entry
    group_weights = entry["weights"]
    return [
        {
            "loss_validation": float(
                _predict_one(
                    float(row.get("num_experts", 0.0)),
                    float(row.get("dense_parameter_count", 0.0)),
                    group_weights,
                )
            )
        }
        for row in input_data
    ]

#5 Run 5 R² = -0.217412

▼

Python

# Auto-generated scaling law for MoE validation loss
# Formula: loss_validation = a_g + b_g * dense_parameter_count^(-ALPHA) + c_g * num_experts^(-BETA)

from typing import List, Dict
import math

# Exponents of the two power-law terms; the same values apply to every group.
ALPHA = 0.23950000000000005
BETA = 0.01

# Per-group constants a, b, c for
#   loss_validation = a + b * dense_parameter_count^(-ALPHA) + c * num_experts^(-BETA)
COEFFS: Dict[str, Dict[str, float]] = {
    "all_data": {
        "a": -5.4067970077023535,
        "b": 58.73130251056639,
        "c": 7.426966606414652,
    },
}

# Fallback constants for groups that have no entry in COEFFS. Copied from the
# "all_data" fit rather than retyped, so the fallback matches that fit
# digit-for-digit while remaining an independent dict.
DEFAULT_COEFFS = dict(COEFFS["all_data"])

def _predict_one(dense_parameter_count: float, num_experts: float, a: float, b: float, c: float) -> float:
    """Evaluate a + b * D^(-ALPHA) + c * E^(-BETA) for one data point.

    Args:
        dense_parameter_count: D in the formula.
        num_experts: E in the formula.
        a: Constant offset.
        b: Multiplier of the dense-parameter term.
        c: Multiplier of the expert-count term.

    Returns:
        The prediction as a float. An input that is NaN, infinite, zero or
        negative is replaced by 1.0 before the formula is applied.
    """
    def positive_or_one(value: float) -> float:
        # Anything that is not a finite positive number becomes 1.0, so the
        # power below is always well defined.
        return value if math.isfinite(value) and value > 0 else 1.0

    dense = positive_or_one(float(dense_parameter_count))
    experts = positive_or_one(float(num_experts))
    dense_term = b * (dense ** (-ALPHA))
    expert_term = c * (experts ** (-BETA))
    return float(a + dense_term + expert_term)

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict validation loss for each data point with the additive power law.

    Args:
        input_data: One dictionary per data point. The keys
                    "dense_parameter_count" and "num_experts" are read; a
                    missing key is treated as 0.0.
        group: Name of the experimental group whose constants to use. A name
               with no entry in COEFFS uses DEFAULT_COEFFS, which also fills
               in any of "a", "b", "c" missing from a group's entry.

    Returns:
        A list parallel to input_data; each element has the single key
        "loss_validation" holding the prediction.
    """
    fallback = DEFAULT_COEFFS
    chosen = COEFFS.get(group, fallback)
    a, b, c = (float(chosen.get(name, fallback[name])) for name in ("a", "b", "c"))
    return [
        {
            "loss_validation": _predict_one(
                row.get("dense_parameter_count", 0.0),
                row.get("num_experts", 0.0),
                a,
                b,
                c,
            )
        }
        for row in input_data
    ]

MoE Scaling Law

All Runs (sorted by R²)