
Parallel Scaling Law

Agent: goose
Model: GPT-5
Best R²: 0.999958
Mean R²: 0.999635
Min R²: 0.999387
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.999958 (Best)
Python
from __future__ import annotations
from typing import Dict, List

# Discovered scaling law (shared exponents across groups):
#   loss = a_g + b_g * num_params^(-alpha) + d_g * parallel_size^(-beta)
# where g is the experimental group (e.g., 'stack', 'pile').
# The exponents (alpha, beta) are group-invariant; the coefficients a_g, b_g, d_g
# are group-specific.

# Global (group-invariant) exponents found by grid-search + least squares
_ALPHA = 0.2608
_BETA = 0.5071

# Group-specific coefficients (fitted on the provided dataset)
_GROUP_PARAMS: Dict[str, Dict[str, float]] = {
    # group: {a, b, d}
    "pile": {"a": 1.3820165417362469, "b": 118.17512888515972, "d": 0.10096113933098574},
    "stack": {"a": 0.7493041023195879, "b": 67.2875703412857, "d": 0.06574974881954163},
}

# Fallback parameters if an unknown group is provided. We take the simple
# average of known groups to avoid errors and provide a reasonable guess
# while retaining the same functional form.
if _GROUP_PARAMS:
    _FALLBACK = {
        "a": sum(p["a"] for p in _GROUP_PARAMS.values()) / len(_GROUP_PARAMS),
        "b": sum(p["b"] for p in _GROUP_PARAMS.values()) / len(_GROUP_PARAMS),
        "d": sum(p["d"] for p in _GROUP_PARAMS.values()) / len(_GROUP_PARAMS),
    }
else:
    _FALLBACK = {"a": 0.0, "b": 0.0, "d": 0.0}


def _predict_loss(num_params: float, parallel_size: float, params: Dict[str, float]) -> float:
    # Guard against invalid inputs
    if num_params <= 0:
        raise ValueError("num_params must be positive")
    if parallel_size <= 0:
        raise ValueError("parallel_size must be positive")
    return (
        params["a"]
        + params["b"] * (num_params ** (-_ALPHA))
        + params["d"] * (parallel_size ** (-_BETA))
    )


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Expected keys: 'num_params', 'parallel_size'.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups,
                but the constant parameters/coefficients differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s). Keys: 'loss'.
    """
    params = _GROUP_PARAMS.get(group, _FALLBACK)
    outputs: List[Dict[str, float]] = []
    for row in input_data:
        # Index directly so a missing key raises a clear KeyError rather than
        # float() failing on None.
        n = float(row["num_params"])
        p = float(row["parallel_size"])
        pred = _predict_loss(n, p, params)
        outputs.append({"loss": float(pred)})
    return outputs
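
For reference, a hypothetical call to law (the input values below are illustrative and not taken from the fitted dataset; the 'other' group shows the averaged-fallback path):

Python
points = [
    {"num_params": 1.0e8, "parallel_size": 1},
    {"num_params": 1.0e8, "parallel_size": 8},
]
print(law(points, "pile"))    # -> [{'loss': ...}, {'loss': ...}]
print(law(points, "stack"))   # group-specific coefficients for 'stack'
print(law(points, "other"))   # unknown group: uses the averaged fallback coefficients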
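
The comments above state that the shared exponents were found by grid search plus least squares. A minimal sketch of that kind of procedure is shown below; the dataset layout (lists of (num_params, parallel_size, loss) tuples per group) and the search grids are assumptions for illustration, since the actual fitting data is not shown on this page. With (alpha, beta) fixed, the model is linear in (a_g, b_g, d_g), so each group reduces to an ordinary least-squares solve.

Python
import numpy as np

def fit_exponents(data_by_group, alpha_grid, beta_grid):
    """Return (alpha, beta, coeffs_by_group) minimizing the total squared error."""
    best = None
    for alpha in alpha_grid:
        for beta in beta_grid:
            total_sse = 0.0
            coeffs = {}
            for group, rows in data_by_group.items():
                n = np.array([r[0] for r in rows], dtype=float)
                p = np.array([r[1] for r in rows], dtype=float)
                y = np.array([r[2] for r in rows], dtype=float)
                # Design matrix for loss = a + b * n^(-alpha) + d * p^(-beta)
                X = np.column_stack([np.ones_like(n), n ** (-alpha), p ** (-beta)])
                sol, *_ = np.linalg.lstsq(X, y, rcond=None)
                resid = y - X @ sol
                total_sse += float(resid @ resid)
                coeffs[group] = {"a": sol[0], "b": sol[1], "d": sol[2]}
            if best is None or total_sse < best[0]:
                best = (total_sse, alpha, beta, coeffs)
    _, alpha, beta, coeffs = best
    return alpha, beta, coeffs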
#2 Run 2 R² = 0.999958
#3 Run 3 R² = 0.999441
#4 Run 4 R² = 0.999433
#5 Run 5 R² = 0.999387