SLD - Parallel Scaling Law - openhands + GPT-5

Best Run 1 R² = 0.999969

▼

Python

from __future__ import annotations
from typing import List, Dict

# Exponents common to every group (per the original note: picked by a
# cross-validated grid search).
_A = 0.26075  # exponent magnitude applied to num_params
_B = 0.50575  # exponent magnitude applied to parallel_size

# Group name -> (c0, cN, cS, cNS)
#   c0  : loss approached as num_params and parallel_size grow without bound
#   cN  : weight of the num_params^{-A} term
#   cS  : weight of the parallel_size^{-B} term
#   cNS : weight of the product term num_params^{-A} * parallel_size^{-B}
_COEFS: Dict[str, tuple[float, float, float, float]] = {
    "pile": (1.39800173, 114.189821, 0.0789779439, 5.29151618),
    "stack": (0.764687078, 63.5153262, 0.0446666145, 5.06084916),
}

# Coefficients used for a group name that is not in _COEFS: the
# position-by-position arithmetic mean over all known groups.
_mean_coefs = tuple(
    sum(column) / len(_COEFS) for column in zip(*_COEFS.values())
)


def _predict_single(n: float, s: float, coefs: tuple[float, float, float, float]) -> float:
    """Evaluate the fitted expression for one (n, s) pair.

    Computes c0 + cN * n**(-_A) + cS * s**(-_B) + cNS * n**(-_A) * s**(-_B),
    where (c0, cN, cS, cNS) = coefs and _A, _B are the module-level exponents.

    Args:
        n: Base of the first power term.
        s: Base of the second power term.
        coefs: The four coefficients (c0, cN, cS, cNS).

    Returns:
        The evaluated expression, or coefs[0] alone when n <= 0 or s <= 0.
    """
    non_positive_input = n <= 0 or s <= 0
    if non_positive_input:
        # The power terms are skipped for non-positive bases; only the
        # constant term is reported.
        return coefs[0]

    inv_pow_n = n ** (-_A)
    inv_pow_s = s ** (-_B)
    intercept, amp_n, amp_s, amp_cross = coefs
    additive_part = intercept + amp_n * inv_pow_n + amp_s * inv_pow_s
    return additive_part + amp_cross * (inv_pow_n * inv_pow_s)


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict 'loss' for every input record with the fitted scaling law.

    All groups share one functional form,

        loss = c0_g + cN_g * num_params^{-A} + cS_g * parallel_size^{-B}
               + cNS_g * (num_params^{-A} * parallel_size^{-B})

    with exponents A and B common to every group and the c*_g coefficients
    looked up per group.

    Args:
        input_data: Records providing 'num_params' and 'parallel_size'.
            A missing key is read as 0.0.
        group: Group name (e.g., 'stack', 'pile'). A name that is not in the
            coefficient table falls back to the averaged coefficients.

    Returns:
        One {'loss': value} record per input record, in input order.
    """
    group_coefs = _COEFS.get(group, _mean_coefs)
    return [
        {
            "loss": float(
                _predict_single(
                    float(record.get("num_params", 0.0)),
                    float(record.get("parallel_size", 0.0)),
                    group_coefs,
                )
            )
        }
        for record in input_data
    ]

#2 Run 2 R² = 0.999960

▼

Python

from typing import List, Dict

# Additive power-law fit: the exponents are shared by all groups, while the
# offset and both amplitudes are fitted per group.
#   loss = L_g + A_g * num_params**(-alpha) + B_g * parallel_size**(-beta)
# The numbers below are the fitted values recorded with the original analysis
# (data under /app/data).

ALPHA = 0.25  # shared exponent magnitude for num_params
BETA = 0.5  # shared exponent magnitude for parallel_size

GROUP_PARAMS = {
    "pile": {
        "L": 1.3602632039069371,
        "A": 98.21459269821332,
        "B": 0.101969095947293,
    },
    "stack": {
        "L": 0.7368400879171292,
        "A": 55.921667192849526,
        "B": 0.06640252659247498,
    },
}

# Parameters for a group name that is not in GROUP_PARAMS: the per-field
# arithmetic mean over the known groups, or all zeros if no group is defined.
if not GROUP_PARAMS:
    DEFAULT_PARAMS = {"L": 0.0, "A": 0.0, "B": 0.0}
else:
    _Ls, _As, _Bs = (
        [fitted[name] for fitted in GROUP_PARAMS.values()]
        for name in ("L", "A", "B")
    )
    DEFAULT_PARAMS = {
        name: sum(column) / len(column)
        for name, column in zip(("L", "A", "B"), (_Ls, _As, _Bs))
    }


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict 'loss' for each data point with the additive power law

        loss = L_g + A_g * num_params**(-ALPHA) + B_g * parallel_size**(-BETA)

    Args:
        input_data: Data points as dictionaries. 'num_params' (read as 0.0
                    when absent) and 'parallel_size' (read as 1.0 when
                    absent) are used; both are raised to at least 1.0 before
                    the powers are taken.
        group: Name of the experimental group. The functional form is the
                same for every group; only L_g, A_g and B_g differ. A name
                that is not in GROUP_PARAMS uses DEFAULT_PARAMS.

    Returns:
        A list parallel to input_data whose items are {'loss': value}.
    """
    fitted = GROUP_PARAMS.get(group, DEFAULT_PARAMS)
    offset, amp_params, amp_parallel = fitted["L"], fitted["A"], fitted["B"]

    def _loss(record: Dict[str, float]) -> float:
        # Clamp both inputs to >= 1.0 so values below 1 never reach the powers.
        size = max(float(record.get("num_params", 0.0)), 1.0)
        width = max(float(record.get("parallel_size", 1.0)), 1.0)
        return offset + amp_params * (size ** (-ALPHA)) + amp_parallel * (width ** (-BETA))

    return [{"loss": float(_loss(record))} for record in input_data]

#3 Run 3 R² = 0.999441

▼

#4 Run 4 R² = 0.999387

▼

Python

from __future__ import annotations

from typing import List, Dict

# Discovered scaling law (shared exponents across groups, group-specific amplitude):
#     loss = A_g * num_params**b * parallel_size**c
# Per the original note: fitted on /app/data via linear regression on log(loss)
# with shared slopes and group-specific intercepts.

import math  # used only for the fallback amplitude below

# Shared exponents
_B = -0.06544312287561933  # exponent for num_params
_C = -0.02049343910349895  # exponent for parallel_size

# Group-specific amplitudes A_g
_AMPLITUDE = {
    "pile": 7.844746369924655,
    "stack": 4.339456524197749,
}


def _geometric_mean(values):
    """Return the geometric mean of positive values, or 1.0 for an empty sequence."""
    if not values:
        return 1.0
    # Average in log space, then map back with exp.
    return math.exp(sum(math.log(v) for v in values) / len(values))


# Fallback amplitude for unknown groups: geometric mean of the known amplitudes.
# It is computed inside a helper so that no scratch list is left behind as a
# public module-level name.
_default_amp = _geometric_mean(list(_AMPLITUDE.values()))


def _safe_positive(x: float) -> float:
    """Return x unchanged when x > 0; otherwise return 1e-12."""
    if x > 0:
        return x
    # Anything that fails the x > 0 test (zero, negatives, NaN) is replaced
    # by a tiny positive stand-in.
    return 1e-12


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict 'loss' for each data point with a multiplicative power law.

    The exponents are shared by every group; only the amplitude A_g depends
    on the group:
        loss = A_g * num_params**b * parallel_size**c

    Args:
        input_data: Data points with keys 'num_params' and 'parallel_size'.
            A missing key is read as 0.0, and every value is passed through
            _safe_positive before the powers are taken.
        group: Name of the experimental group (e.g., 'stack', 'pile'). A name
            that is not in the amplitude table uses the fallback amplitude.

    Returns:
        A list with one {'loss': predicted_value} dictionary per input.
    """
    amplitude = _AMPLITUDE.get(group, _default_amp)
    exp_params, exp_parallel = _B, _C

    # Lazily turn each record into a sanitized (num_params, parallel_size) pair.
    sanitized = (
        (
            _safe_positive(float(record.get("num_params", 0.0))),
            _safe_positive(float(record.get("parallel_size", 0.0))),
        )
        for record in input_data
    )
    return [
        {"loss": float(amplitude * (size ** exp_params) * (width ** exp_parallel))}
        for size, width in sanitized
    ]

#5 Run 5 R² = 0.999387

▼

Python

from __future__ import annotations

from typing import Dict, List
import math

# Pre-fitted parameters (per the original note: pooled log-log regression on
# the provided dataset).
# Functional form (shared across groups):
#   loss = A_g * (num_params)^{-alpha} * (parallel_size)^{-beta}
# Group-specific amplitude A_g; shared exponents alpha, beta.

_ALPHA = 0.06544312287562275
_BETA = 0.020493439103498882
_A_BY_GROUP: Dict[str, float] = {
    "stack": 4.339456524197764,
    "pile": 7.844746369924683,
}
# Fallback amplitude for unknown groups: geometric mean of every known A_g.
# Averaging the logs over all entries (instead of naming "stack" and "pile"
# explicitly) keeps the fallback correct when groups are added or renamed.
_A_DEFAULT = math.exp(
    sum(math.log(amp) for amp in _A_BY_GROUP.values()) / len(_A_BY_GROUP)
)


def _predict_one(num_params: float, parallel_size: float, A: float) -> float:
    """Return A * n**(-_ALPHA) * p**(-_BETA) for one data point.

    n and p are num_params and parallel_size converted to float and raised
    to at least 1.0, so values below 1 never reach the powers.
    """
    clamped_params = max(float(num_params), 1.0)
    clamped_parallel = max(float(parallel_size), 1.0)
    params_factor = clamped_params ** (-_ALPHA)
    parallel_factor = clamped_parallel ** (-_BETA)
    return A * params_factor * parallel_factor


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict 'loss' for each data point with the fitted multiplicative power law.

    Args:
        input_data: A list of dictionaries, one per data point. The keys
                    'num_params' (read as 0.0 when absent) and
                    'parallel_size' (read as 1.0 when absent) are used.
        group: The name of the experimental group. The functional form is the
                same for all groups; only the amplitude A differs. A name that
                is not in the amplitude table uses the fallback amplitude.

    Returns:
        A list parallel to input_data; each item is a dictionary holding the
        prediction under key 'loss'.
    """
    amplitude = _A_BY_GROUP.get(group, _A_DEFAULT)
    return [
        {
            "loss": float(
                _predict_one(
                    record.get("num_params", 0.0),
                    record.get("parallel_size", 1.0),
                    amplitude,
                )
            )
        }
        for record in input_data
    ]

Parallel Scaling Law

All Runs (sorted by R²)