SLD - Parallel Scaling Law - terminus-2 + GPT-5

Best Run 1 R² = 0.999963

▼

Python

from __future__ import annotations
import math

# Identifier of the functional form that `law` asks `_predict_one` to evaluate.
MODEL = 1

# Fitted coefficients, keyed by group name and then by coefficient name.
PARAMS = {
    'stack': {
        'a': 0.7807154665753339,
        'b': 105.92510486846706,
        'c': 0.05930853037090343,
        'alpha': 0.2869005799170186,
        'beta': 0.5903836027749966,
    },
    'pile': {
        'a': 1.3227092003096266,
        'b': 82.24622115179116,
        'c': 0.11745280123737169,
        'alpha': 0.2395414702789555,
        'beta': 0.4115597137836079,
    },
}

def _predict_one(x: dict[str, float], coeffs: dict[str, float], model: int) -> float:
    """Evaluate one of three candidate scaling-law forms for a single data point.

    Args:
        x: Data point providing ``'num_params'`` (N) and ``'parallel_size'`` (P).
        coeffs: Coefficients for the selected form, looked up by name.
        model: Form selector:
            1 -> ``a + b * N**(-alpha) + c * P**(-beta)``
            2 -> ``a + d * (N*P)**(-alpha)``
            3 -> ``a + b * N**(-alpha) + d * (N*P)**(-beta)``

    Returns:
        The predicted value as computed by the selected form.

    Raises:
        ValueError: If N or P is not positive, or ``model`` is not 1, 2 or 3.
        TypeError: If either input key is absent (``float(None)``).
        KeyError: If ``coeffs`` lacks a coefficient the selected form needs.
    """
    n_params = float(x.get('num_params'))
    n_parallel = float(x.get('parallel_size'))
    # Inputs are validated before the model id, so bad inputs win over a bad id.
    if n_params <= 0 or n_parallel <= 0:
        raise ValueError("num_params and parallel_size must be positive")

    if model == 1:
        # Independent power-law terms in N and in P.
        a, b, c, alpha, beta = (
            coeffs[name] for name in ('a', 'b', 'c', 'alpha', 'beta')
        )
        return a + b * n_params ** (-alpha) + c * n_parallel ** (-beta)

    if model == 2:
        # A single power law in the product N*P.
        a, d, alpha = (coeffs[name] for name in ('a', 'd', 'alpha'))
        return a + d * (n_params * n_parallel) ** (-alpha)

    if model == 3:
        # Power law in N plus a power law in the product N*P.
        a, b, d, alpha, beta = (
            coeffs[name] for name in ('a', 'b', 'd', 'alpha', 'beta')
        )
        return a + b * n_params ** (-alpha) + d * (n_params * n_parallel) ** (-beta)

    raise ValueError("Unknown model id")

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """Predict ``loss`` for each data point using the fitted scaling law.

    Args:
        input_data: Data points, each passed unchanged to ``_predict_one``.
        group: Name of the group whose coefficients should be used. A group
            missing from ``PARAMS`` gets the per-coefficient average over all
            known groups.

    Returns:
        One ``{'loss': value}`` dictionary per input point, in input order.
    """
    if group in PARAMS:
        coeffs = PARAMS[group]
    else:
        # Unseen group: average every coefficient across the known groups,
        # taking the coefficient names from the first group's entry.
        template = next(iter(PARAMS.values()))
        coeffs = {}
        for name in template.keys():
            coeffs[name] = sum(p[name] for p in PARAMS.values()) / len(PARAMS)

    return [
        {'loss': float(_predict_one(point, coeffs, int(MODEL)))}
        for point in input_data
    ]

#2 Run 2 R² = 0.999588

▼

Python

from __future__ import annotations
from typing import List, Dict

# Per-group fitted parameters of the scaling law
#     loss = L_inf + A * num_params**(-alpha) * parallel_size**(-beta)
PARAMS = {
    'stack': {
        'L_inf': 0.4906,
        'A': 7.92697458504,
        'alpha': 0.122304765784,
        'beta': 0.0406343024103,
    },
    'pile': {
        'L_inf': 1.2938,
        'A': 40.6611812144,
        'alpha': 0.194471508223,
        'beta': 0.0568103691424,
    },
}

def _predict_one(x: Dict[str, float], par: Dict[str, float]) -> Dict[str, float]:
    """Evaluate ``L_inf + A * n**(-alpha) * p**(-beta)`` for one data point.

    Args:
        x: Data point; ``'num_params'`` defaults to 0.0 and ``'parallel_size'``
            to 1.0 when absent.
        par: Parameters ``'L_inf'``, ``'A'``, ``'alpha'`` and ``'beta'``.

    Returns:
        A dictionary with the single key ``'loss'``.
    """
    # Guardrail: floor both inputs at a tiny positive value so the power
    # terms are never evaluated on a zero or negative base.
    size = max(float(x.get('num_params', 0.0)), 1e-12)
    width = max(float(x.get('parallel_size', 1.0)), 1e-12)

    floor_loss = par['L_inf']
    scale = par['A']
    size_term = size ** (-par['alpha'])
    width_term = width ** (-par['beta'])
    return {'loss': float(floor_loss + scale * size_term * width_term)}


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. Every group
               shares the same functional form; only the fitted constants
               differ. A group without an entry in ``PARAMS`` uses the mean
               of each parameter over the known groups.

    Returns:
        A list aligned with ``input_data``; each element is the dictionary of
        predicted output variable(s) for the corresponding data point.
    """
    known = PARAMS.get(group)
    if known is not None:
        par = known
    elif not PARAMS:
        # Nothing fitted at all: use fixed placeholder parameters.
        par = {'L_inf': 0.0, 'A': 1.0, 'alpha': 0.5, 'beta': 0.5}
    else:
        # Unseen group: average each parameter across the known groups.
        import statistics as _st
        par = {
            name: _st.mean(v[name] for v in PARAMS.values())
            for name in ('L_inf', 'A', 'alpha', 'beta')
        }

    results = []
    for point in input_data:
        results.append(_predict_one(point, par))
    return results

#3 Run 3 R² = 0.999572

▼

#4 Run 4 R² = 0.999456

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The law is linear in log space with an interaction term:

        loss = b0 + b1*ln(N) + b2*ln(P) + b3*ln(N)*ln(P)

    where N is ``num_params`` and P is ``parallel_size``.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. A missing ``num_params`` or
                    ``parallel_size`` is treated as 0.0.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law must be the same for all groups,
                but the constant parameters/coefficients can differ per group.
                Lookup is exact first, then case-insensitive; an unknown group
                uses the coefficients of the first embedded group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s). The ``loss`` is
        NaN for any point whose N or P is not positive, since the logarithms
        are undefined there.
    """
    import math

    # Embedded coefficients per group (fitted on training data), in the
    # order [b0, b1, b2, b3] of the formula above.
    coeffs_by_group = {
        'stack': [2.6297221965258952, -0.07281785641865243, -0.06734214565793178, 0.0020619859360822388],
        'pile': [4.649586613302491, -0.1269025023692098, -0.08186831350347822, 0.0021372473711799705],
    }

    coeffs = coeffs_by_group.get(group)
    if coeffs is None:
        # Try a case-insensitive match before giving up on the name.
        wanted = str(group).lower()
        coeffs = next(
            (vals for name, vals in coeffs_by_group.items() if name.lower() == wanted),
            None,
        )
    if coeffs is None:
        # Unknown group: fall back to the first available coefficient set.
        coeffs = next(iter(coeffs_by_group.values()))

    # Explicit unpacking fails loudly if a coefficient list has the wrong length.
    b0, b1, b2, b3 = coeffs

    out = []
    for row in input_data:
        n = float(row.get("num_params", 0.0))
        p = float(row.get("parallel_size", 0.0))
        if n <= 0 or p <= 0:
            y = float("nan")
        else:
            ln_n = math.log(n)
            ln_p = math.log(p)
            y = b0 + b1 * ln_n + b2 * ln_p + b3 * (ln_n * ln_p)
        out.append({"loss": float(y)})
    return out

#5 Run 5 R² = 0.999411

▼

Python

# Auto-generated scaling law implementation
from __future__ import annotations
from math import pow

# Exponents shared by every group.
_ALPHA = 0.2205027977154814
_BETA = 0.06882704966401557

# Coefficients fitted separately for each group.
_PARAMS = {
    'pile': {'a': 1.3716409140011552, 'b': 62.350188436049685},
    'stack': {'a': 0.7469274208971536, 'b': 35.773459284354644},
}

# Coefficients for an unseen group: the per-coefficient mean over the known
# groups, or a fixed placeholder when no group has been fitted.
_FALLBACK = (
    {
        name: sum(v[name] for v in _PARAMS.values()) / len(_PARAMS)
        for name in ('a', 'b')
    }
    if _PARAMS
    else {'a': 0.0, 'b': 1.0}
)


def _coeffs_for(group: str):
    """Return the coefficients for ``group``, or ``_FALLBACK`` if it is unknown."""
    key = str(group)
    if key in _PARAMS:
        return _PARAMS[key]
    return _FALLBACK


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Each point is evaluated as ``a + b / (N**_ALPHA * P**_BETA)``, where N is
    ``num_params``, P is ``parallel_size`` and ``a``/``b`` come from
    ``_coeffs_for(group)``.

    Args:
        input_data: List of data points; each is a dictionary mapping input
                    variable names to their values.
        group: Name of the experimental group to predict for. Every group
               shares the same functional form; only the constant
               coefficients differ per group.

    Returns:
        A list aligned with ``input_data``; each element is a dictionary
        holding the predicted ``loss`` for the corresponding data point.
    """
    group_coeffs = _coeffs_for(group)
    offset = float(group_coeffs['a'])
    scale = float(group_coeffs['b'])
    n_exponent = float(_ALPHA)
    p_exponent = float(_BETA)

    def _loss(point):
        n = float(point.get('num_params'))
        p = float(point.get('parallel_size'))
        denom = pow(n, n_exponent) * pow(p, p_exponent)
        if denom > 0:
            return offset + scale / denom
        # A non-positive denominator contributes nothing beyond the offset.
        return offset + 0.0

    return [{'loss': float(_loss(point))} for point in input_data]

Parallel Scaling Law

All Runs (sorted by R²)