
Parallel Scaling Law

Agent: opencode
Model: GPT-5
Best R²: 0.999953
Mean R²: 0.999554
Min R²: 0.999298
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.999953 (Best)
Python
from typing import List, Dict

# Discovered scaling law (same functional form for all groups):
#   loss = c + a * num_params**(-alpha) + b * parallel_size**(-beta)
# Coefficients are fitted per group (a hedged usage/refit sketch follows the run listing below).
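# Worked example with hypothetical coefficients (illustration only, not fitted values):
#   c=1.0, a=90.0, alpha=0.25, b=0.1, beta=0.5, num_params=1e9, parallel_size=4
#   loss = 1.0 + 90.0 * 1e9**-0.25 + 0.1 * 4**-0.5 ≈ 1.0 + 0.506 + 0.050 ≈ 1.556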

_PARAMS = {
    # Fitted on provided dataset
    "stack": {
        "c": 0.7711276768482299,
        "a": 82.70170857310372,
        "alpha": 0.27272727272727276,
        "b": 0.0560743949982965,
        "beta": 0.643939393939394,
    },
    "pile": {
        "c": 1.3473420493745163,
        "a": 94.8923034356369,
        "alpha": 0.24797979797979802,
        "b": 0.11068492806080414,
        "beta": 0.445959595959596,
    },
}

# Fallback parameters (simple average of known groups) for unseen groups
if _PARAMS:
    _FALLBACK = {
        k: sum(d[k] for d in _PARAMS.values()) / len(_PARAMS)
        for k in ("c", "a", "alpha", "b", "beta")
    }
else:
    _FALLBACK = {"c": 1.0, "a": 1.0, "alpha": 0.5, "b": 0.1, "beta": 0.5}


def _predict_one(x: Dict[str, float], p: Dict[str, float]) -> float:
    n = float(x.get("num_params", 0.0))
    psize = float(x.get("parallel_size", 1.0))
    # Guard against non-positive inputs
    if n <= 0:
        # Degenerate case: drop the parameter term (keep intercept + parallel term)
        n_term = 0.0
    else:
        n_term = n ** (-p["alpha"])
    if psize <= 0:
        # Degenerate case: drop the parallel term
        p_term = 0.0
    else:
        p_term = psize ** (-p["beta"])
    return p["c"] + p["a"] * n_term + p["b"] * p_term


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    params = _PARAMS.get(group, _FALLBACK)
    preds: List[Dict[str, float]] = []
    for x in input_data:
        y = _predict_one(x, params)
        preds.append({"loss": float(y)})
    return preds
#2 Run 2 R² = 0.999658
#3 Run 3 R² = 0.999473
#4 Run 4 R² = 0.999387
#5 Run 5 R² = 0.999298
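
For reference, here is a minimal sketch of how the submitted law() could be called and how per-group coefficients of the same functional form could be refit with SciPy. It assumes the Run 1 code above is available in the same module; the column names, sample inputs, and initial guess are illustrative assumptions, not values taken from the competition dataset.

Python
# Hedged sketch: call law() and refit the same functional form with SciPy.
# Assumes the Run 1 code above (law, _PARAMS) is defined in the same module;
# all concrete numbers below are illustrative, not taken from the dataset.
from typing import Dict, Sequence

import numpy as np
from scipy.optimize import curve_fit


def _form(X, c, a, alpha, b, beta):
    """loss = c + a * num_params**(-alpha) + b * parallel_size**(-beta)"""
    n, psize = X
    return c + a * n ** (-alpha) + b * psize ** (-beta)


def fit_group(num_params: Sequence[float],
              parallel_size: Sequence[float],
              loss: Sequence[float]) -> Dict[str, float]:
    """Fit (c, a, alpha, b, beta) for one group; p0 is a heuristic starting guess."""
    xdata = (np.asarray(num_params, dtype=float),
             np.asarray(parallel_size, dtype=float))
    popt, _ = curve_fit(_form, xdata, np.asarray(loss, dtype=float),
                        p0=[1.0, 50.0, 0.3, 0.1, 0.5], maxfev=20000)
    return dict(zip(("c", "a", "alpha", "b", "beta"), popt))


if __name__ == "__main__":
    # Hypothetical query: a 1B-parameter model trained with parallel_size 4.
    print(law([{"num_params": 1e9, "parallel_size": 4}], group="pile"))

Fitting each group separately with the same _form keeps the functional form identical across groups while letting the coefficients differ, matching the constraint stated in the law() docstring.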