SLD - Parallel Scaling Law

All Runs (sorted by R²)

Best Run 1 R² = 0.999964

▼

Python

from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Evaluate the fitted parallel scaling law for each row of ``input_data``.

    The model is a constant floor plus two additive power-law terms, one in
    the parameter count and one in the parallel size:

        loss = c0 + c1 * num_params**(-0.26) + c2 * parallel_size**(-0.49)

    The two exponents are the same for every group; (c0, c1, c2) depend on
    the group.

    Args:
        input_data: Rows to predict for. Each row is read for the keys
                    'num_params' and 'parallel_size'. A missing
                    'parallel_size' is treated as 1.0; a missing
                    'num_params' is treated as 0.0 and therefore rejected.
        group: Name of the experimental group. 'stack' and 'pile' have fitted
               coefficients; any other value falls back to a fixed set of
               coefficients lying between the two.

    Returns:
        One ``{'loss': value}`` dictionary per input row, in input order.

    Raises:
        ValueError: If 'num_params' or 'parallel_size' is not strictly positive.
    """
    # Exponents shared by all groups.
    size_exponent = 0.26
    parallel_exponent = 0.49

    # (c0, c1, c2) per group.
    fitted = {
        "stack": (0.74683622, 66.36773787, 0.06734961),
        "pile": (1.37803186, 116.559736, 0.103430375),
    }
    # Unknown groups get in-between coefficients instead of a hard failure.
    in_between = (1.06, 90.0, 0.085)
    floor, size_coef, parallel_coef = fitted.get(group, in_between)

    def _evaluate(row: Dict[str, float]) -> Dict[str, float]:
        # Convert both inputs before validating either one.
        n_value = float(row.get("num_params", 0.0))
        p_value = float(row.get("parallel_size", 1.0))

        if n_value <= 0.0:
            raise ValueError("num_params must be positive")
        if p_value <= 0.0:
            raise ValueError("parallel_size must be positive")

        size_term = size_coef * (n_value ** (-size_exponent))
        parallel_term = parallel_coef * (p_value ** (-parallel_exponent))
        return {"loss": float(floor + size_term + parallel_term)}

    return [_evaluate(row) for row in input_data]

#2 Run 2 R² = 0.999952

▼

Python

from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the loss for each input row with an additive, separable power law.

    Model (all five constants are group-specific):

        loss = a + b * num_params**(-alpha) + c * parallel_size**(-beta)

    Args:
        input_data: Rows to predict for. Each row is read for the keys
                    'num_params' and 'parallel_size'; a missing key is
                    treated as 0.0, which counts as invalid.
        group: Name of the experimental group. 'stack' and 'pile' have fitted
               constants. Any other group uses the mean of the two groups'
               a, b, c and beta, with alpha fixed at 0.26.

    Returns:
        One ``{'loss': value}`` dictionary per input row, in input order.
        Rows where either input is zero or negative yield NaN rather than
        raising.
    """
    stack_fit = {
        'a': 0.7581171833768028,
        'b': 66.36794926297301,
        'c': 0.05629561065261051,
        'alpha': 0.26,
        'beta': 0.64,
    }
    pile_fit = {
        'a': 1.3731766527713336,
        'b': 116.5596685778046,
        'c': 0.10821525527827004,
        'alpha': 0.26,
        'beta': 0.46,
    }
    fits = {'stack': stack_fit, 'pile': pile_fit}

    if group in fits:
        chosen = fits[group]
    else:
        # Unknown group: average the two fits, keeping the common alpha.
        chosen = {
            name: (stack_fit[name] + pile_fit[name]) / 2.0
            for name in ('a', 'b', 'c', 'beta')
        }
        chosen['alpha'] = 0.26

    offset = chosen['a']
    size_coef = chosen['b']
    parallel_coef = chosen['c']
    size_exp = chosen['alpha']
    parallel_exp = chosen['beta']
    not_a_number = float('nan')

    results: List[Dict[str, float]] = []
    for row in input_data:
        n_value = float(row.get('num_params', 0.0))
        p_value = float(row.get('parallel_size', 0.0))
        # Invalid (non-positive) inputs produce NaN instead of an exception.
        prediction = (
            not_a_number
            if (n_value <= 0 or p_value <= 0)
            else offset
            + size_coef * (n_value ** (-size_exp))
            + parallel_coef * (p_value ** (-parallel_exp))
        )
        results.append({'loss': float(prediction)})

    return results

#3 Run 3 R² = 0.999643

▼

Python

from __future__ import annotations

from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the loss for each input row with a multiplicative power law.

    Model:

        loss = L_inf + K * num_params**(-alpha) * parallel_size**(-beta)

    The exponents ``alpha`` and ``beta`` are shared across groups; the
    intercept ``L_inf`` and the amplitude ``K`` are group-specific.

    Args:
        input_data: Rows to predict for. Each row must contain the keys
                    'num_params' and 'parallel_size', both strictly positive.
        group: Name of the experimental group; must be 'pile' or 'stack'.

    Returns:
        One ``{'loss': value}`` dictionary per input row, in input order.

    Raises:
        ValueError: If ``group`` is not supported, or if 'num_params' or
            'parallel_size' is not strictly positive.
        KeyError: If a row lacks 'num_params' or 'parallel_size'.
    """
    # Global exponents (shared across groups)
    alpha = 0.21645409733622326
    beta = 0.06332528602088167

    # Group-specific intercept (L_inf) and amplitude (K)
    params = {
        "pile": {
            "L_inf": 1.35878468572794,
            "K": 58.33538329741633,
        },
        "stack": {
            "L_inf": 0.7397073464609784,
            "K": 33.454566214339,
        },
    }

    if group not in params:
        raise ValueError(f"Unsupported group: {group!r}. Supported: {sorted(params)}")

    L_inf = params[group]["L_inf"]
    K = params[group]["K"]

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        try:
            num_params = float(row["num_params"])
            parallel_size = float(row["parallel_size"])
        except KeyError as e:
            raise KeyError(
                "Each input row must contain 'num_params' and 'parallel_size'."
            ) from e

        # A zero base with a negative exponent raises ZeroDivisionError, and a
        # negative base with a fractional exponent silently becomes complex
        # (then fails in float()). Reject both up front with a clear message.
        if num_params <= 0.0:
            raise ValueError("num_params must be positive")
        if parallel_size <= 0.0:
            raise ValueError("parallel_size must be positive")

        # Scaling law: loss = L_inf_g + K_g * num_params^{-alpha} * parallel_size^{-beta}
        loss = L_inf + K * (num_params ** (-alpha)) * (parallel_size ** (-beta))
        outputs.append({"loss": float(loss)})

    return outputs

#4 Run 4 R² = 0.999572

▼

Python

from __future__ import annotations

import math
from typing import Dict, List


def _predict_loss(num_params: float, parallel_size: float, a: float, b: float, c: float) -> float:
    """Evaluate the log-linear law for a single data point.

    Computes ``exp(a + b * ln(num_params) + c * ln(parallel_size))``. Each
    input is clamped from below at 1e-12 before taking its logarithm, so
    zero or negative inputs are evaluated at 1e-12 instead of raising.
    """
    smallest = 1e-12
    log_n = math.log(max(float(num_params), smallest))
    log_p = math.log(max(float(parallel_size), smallest))
    log_loss = a + b * log_n + c * log_p
    return float(math.exp(log_loss))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the loss for each input row with a power law fitted in log-space.

    Model, with group-specific (a, b, c):

        ln(loss) = a + b * ln(num_params) + c * ln(parallel_size)

    Args:
        input_data: Rows to predict for. Each row must provide 'num_params'
                    and 'parallel_size'.
        group: Name of the experimental group. 'stack' and 'pile' have their
               own coefficients; any other group uses a shared fallback set.

    Returns:
        One ``{'loss': value}`` dictionary per input row, in input order.

    Raises:
        KeyError: If a row has no (or a ``None``) 'num_params' or
            'parallel_size'.
    """
    # (a, b, c) per group, as fitted in log-space.
    per_group = {
        "stack": (1.48841967466, -0.066371142262, -0.022109243284),
        "pile": (2.039173496278, -0.064515103489, -0.018877634923),
    }
    # Shared coefficients used when the group is not recognised.
    shared = (1.763796585469, -0.065443122876, -0.020493439103)

    intercept, size_slope, parallel_slope = per_group.get(group, shared)

    predictions: List[Dict[str, float]] = []
    for row in input_data:
        size = row.get("num_params")
        parallel = row.get("parallel_size")
        if size is None or parallel is None:
            raise KeyError("Each input row must include 'num_params' and 'parallel_size'.")
        predictions.append(
            {"loss": _predict_loss(size, parallel, intercept, size_slope, parallel_slope)}
        )
    return predictions

#5 Run 5 R² = 0.994600

▼

Python

from __future__ import annotations

from typing import Dict, List


def _predict_loss(num_params: float, parallel_size: float, coeffs: Dict[str, float]) -> float:
    """Evaluate the effective-size scaling law for a single data point.

    Computes ``L_inf + k * (num_params * parallel_size**gamma) ** (-alpha)``
    using the 'L_inf', 'k', 'alpha' and 'gamma' entries of ``coeffs``.

    Raises:
        ValueError: If ``num_params`` or ``parallel_size`` is not strictly
            positive.
    """
    n = float(num_params)
    p = float(parallel_size)
    # A zero base with a negative exponent raises ZeroDivisionError, and a
    # negative base with a fractional exponent silently becomes complex (then
    # fails in float()). Reject both up front with a clear message.
    if n <= 0.0:
        raise ValueError("num_params must be positive")
    if p <= 0.0:
        raise ValueError("parallel_size must be positive")

    L_inf = coeffs["L_inf"]
    k = coeffs["k"]
    alpha = coeffs["alpha"]
    gamma = coeffs["gamma"]
    # Scaling law: L = L_inf + k * (num_params * parallel_size**gamma) ** (-alpha)
    n_eff = n * (p ** float(gamma))
    return float(L_inf + k * (n_eff ** (-alpha)))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict the loss for each input row with an effective-size power law.

    Model, with group-specific (L_inf, k, alpha, gamma):

        loss = L_inf + k * (num_params * parallel_size**gamma) ** (-alpha)

    Args:
        input_data: Rows to predict for. Each row is read for the keys
                    'num_params' and 'parallel_size'. A missing
                    'parallel_size' is treated as 1.0; a missing
                    'num_params' is treated as 0.0 and therefore rejected.
        group: Name of the experimental group. 'pile' and 'stack' have fitted
               coefficients; any other group uses the per-parameter mean of
               the known groups.

    Returns:
        One ``{'loss': value}`` dictionary per input row, in input order.

    Raises:
        ValueError: If 'num_params' or 'parallel_size' is not strictly positive.
    """

    # Fitted coefficients per group: { L_inf, k, alpha, gamma }
    GROUP_COEFFS: Dict[str, Dict[str, float]] = {
        "pile": {"L_inf": 0.9994680998203908, "k": 14.076803963829533, "alpha": 0.1260671701767887, "gamma": 0.5},
        "stack": {"L_inf": 0.6239972998774463, "k": 11.721235202089247, "alpha": 0.15204205303795576, "gamma": 0.5},
    }

    if group in GROUP_COEFFS:
        coeffs = GROUP_COEFFS[group]
    else:
        # Unseen group: simple average of each parameter over the known groups.
        vals = list(GROUP_COEFFS.values())
        coeffs = {
            name: sum(v[name] for v in vals) / len(vals)
            for name in ("L_inf", "k", "alpha", "gamma")
        }

    return [
        {
            "loss": _predict_loss(
                row.get("num_params", 0.0),
                row.get("parallel_size", 1.0),
                coeffs,
            )
        }
        for row in input_data
    ]