# Auto-generated scaling law for parallel model training
# Formula:
# loss = c0_g + c1_g * N^(-alpha) + c2_g * P^(-beta) + c3_g * (N^(-alpha) * P^(-beta))
# with exponents alpha, beta shared across groups.
from typing import List, Dict
def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict training loss from model size and parallelism via a scaling law.

    loss = c0 + c1 * N^(-alpha) + c2 * P^(-beta) + c3 * N^(-alpha) * P^(-beta)

    The exponents alpha/beta are shared across all groups; the coefficients
    c0..c3 differ per group. An unknown group falls back to the element-wise
    mean of all known groups' coefficients.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
            supply the model size under 'num_params' (aliases: 'n_params',
            'params', 'n') and the parallelism degree under 'parallel_size'
            (aliases: 'p', 'degree', 'mp_size', 'world_size').
        group: The experimental group name selecting the coefficient set.

    Returns:
        A list parallel to input_data; each dict holds the predicted 'loss'.

    Raises:
        KeyError: If a row lacks either input under every recognized alias.
    """
    ALPHA = 0.25125
    BETA = 0.513333333333
    COEFS: dict[str, list[float]] = {
        "pile": [1.38133216084, 97.066659052, 0.0772792357873, 4.4654429466],
        "stack": [0.755424951209, 54.0083122959, 0.0433880237503, 4.26602562249],
    }

    coeffs = COEFS.get(group)
    if coeffs is None:
        # Fallback for an unknown group: average each coefficient across all
        # known groups. Computed lazily — only when actually needed — instead
        # of on every call.
        coeffs = [
            sum(cs[i] for cs in COEFS.values()) / len(COEFS) for i in range(4)
        ]

    def _first_present(row: dict[str, float], aliases: tuple[str, ...]):
        # Return the first alias present in `row` with a non-None value.
        # Unlike a nested-get chain, an explicit None stored under an early
        # alias does not shadow a real value under a later one.
        for key in aliases:
            value = row.get(key)
            if value is not None:
                return value
        return None

    N_ALIASES = ('num_params', 'n_params', 'params', 'n')
    P_ALIASES = ('parallel_size', 'p', 'degree', 'mp_size', 'world_size')

    # Loop-invariant: unpack once, not per row.
    c0, c1, c2, c3 = coeffs
    out: list[dict[str, float]] = []
    for row in input_data:
        n = _first_present(row, N_ALIASES)
        p = _first_present(row, P_ALIASES)
        if n is None or p is None:
            raise KeyError("Each input dict must contain 'num_params' and 'parallel_size' (or recognized aliases).")
        # Clamp to a tiny positive value so non-positive inputs cannot yield
        # complex results or a zero-division in the negative-power terms.
        n = max(float(n), 1e-12)
        p = max(float(p), 1e-12)
        x1 = n ** (-ALPHA)
        x2 = p ** (-BETA)
        pred = float(c0 + c1 * x1 + c2 * x2 + c3 * x1 * x2)
        out.append({'loss': pred})
    return out