SLD - LR & Batch Size Scaling Law

All Runs (sorted by R²)

#1 (Best) Run 1 R² = -0.451316

▼

Python

# Auto-generated scaling law
# lm_loss ≈ a0 + a1*L + a2*L^2 + b1*B + b2*B^2 + c1*D + d1*P
# where L=log10(lr), B=log10(bsz), D=log10(data_size), P=log10(non_embedding_param_size)

def _eval_formula(x, w):
    """Evaluate the quadratic-in-log scaling law for one data point.

    The model is

        y = a0 + a1*L + a2*L^2 + b1*B + b2*B^2 + c1*D + d1*P

    where L, B, D and P are the base-10 logarithms of ``lr``, ``bsz``,
    ``data_size`` and ``non_embedding_param_size``.

    Args:
        x: Mapping from input name to value. A missing key is read as 0.0.
        w: Exactly seven coefficients, ordered (a0, a1, a2, b1, b2, c1, d1).

    Returns:
        The value of the formula for this data point.
    """
    import math

    def _as_float(key):
        # Absent inputs default to 0.0, which the clamp below then replaces.
        return float(x.get(key, 0.0))

    def _clamped_log10(value):
        # Non-positive values are swapped for 1e-12 so log10 stays defined.
        return math.log10(value if value > 0 else 1e-12)

    raw_lr = _as_float('lr')
    raw_bsz = _as_float('bsz')
    raw_data = _as_float('data_size')
    raw_params = _as_float('non_embedding_param_size')

    log_lr = _clamped_log10(raw_lr)
    log_bsz = _clamped_log10(raw_bsz)
    log_data = _clamped_log10(raw_data)
    log_params = _clamped_log10(raw_params)

    a0, a1, a2, b1, b2, c1, d1 = w
    return (
        a0
        + a1*log_lr + a2*log_lr*log_lr
        + b1*log_bsz + b2*log_bsz*log_bsz
        + c1*log_data
        + d1*log_params
    )

# Fitted coefficients per group, ordered (a0, a1, a2, b1, b2, c1, d1) to match
# the unpacking in _eval_formula.
_COEFFS = {
    "all_data": [
        9.520327923915133,     # a0: intercept
        0.7972975713031198,    # a1: log10(lr)
        0.13716991794202735,   # a2: log10(lr)^2
        -0.4500903878962123,   # b1: log10(bsz)
        0.09891540904376367,   # b2: log10(bsz)^2
        -0.277570570267418,    # c1: log10(data_size)
        -0.3053374704220124,   # d1: log10(non_embedding_param_size)
    ],
}
# Coefficients used when the requested group is not in _COEFFS: an independent
# copy of the "all_data" fit.
_FALLBACK = list(_COEFFS["all_data"])

from typing import List, Dict

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``lm_loss`` for every data point using the fitted scaling law.

    Args:
        input_data: One dictionary per data point, mapping input variable
                    names to their values.
        group: Name of the experimental group whose coefficients are used.
               The functional form is shared by all groups; only the
               coefficients differ. A group without an entry in ``_COEFFS``
               is evaluated with ``_FALLBACK``.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key ``'lm_loss'``.
    """
    coefficients = _COEFFS.get(group, _FALLBACK)
    return [
        {'lm_loss': float(_eval_formula(point, coefficients))}
        for point in input_data
    ]

#2 Run 2 R² = -0.773483

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``lm_loss`` with a power law in the four input variables.

    The model is linear in natural-log space:

        log(lm_loss) = c0 + c1*log(lr) + c2*log(bsz) + c3*log(data_size)
                          + c4*log(non_embedding_param_size)

    which is evaluated here in its multiplicative form

        lm_loss = exp(c0) * lr**c1 * bsz**c2 * data_size**c3
                  * non_embedding_param_size**c4

    Args:
        input_data: One dictionary per data point, mapping input variable
                    names to their values. A missing key is read as 0.0.
        group: Name of the experimental group whose coefficients are used.
               The functional form is shared by all groups; only the
               coefficients differ. An unknown group uses the first group
               in the coefficient table.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key ``'lm_loss'``. The value is NaN when any of the four
        inputs is non-positive.
    """
    import math

    fitted = {
        "all_data": {
            "c0": 3.0805017396527683,
            "c1": 0.00863691905384939,
            "c2": -0.0005162836622543786,
            "c3": -0.04700957690670226,
            "c4": -0.05174150134631458,
            # Fit diagnostics kept next to the coefficients; not read below.
            "rmse": 0.06897968965227057,
            "mape": 0.019885314277278004,
            "n": 2702,
        },
    }

    # Unknown group: fall back to the first entry of the table (if any).
    chosen = fitted.get(group)
    if chosen is None:
        chosen = next(iter(fitted.values()), None)

    # The coefficients do not change from row to row, so resolve them once.
    if chosen is not None:
        prefactor = math.exp(chosen['c0'])
        exp_lr, exp_bsz, exp_data, exp_params = (
            chosen[name] for name in ('c1', 'c2', 'c3', 'c4')
        )

    def _predict(row):
        lr = float(row.get('lr', 0.0))
        bsz = float(row.get('bsz', 0.0))
        data_size = float(row.get('data_size', 0.0))
        n_params = float(row.get('non_embedding_param_size', 0.0))

        # A power law needs strictly positive bases; anything else yields NaN.
        if chosen is None or lr <= 0 or bsz <= 0 or data_size <= 0 or n_params <= 0:
            return float('nan')
        return (
            prefactor
            * (lr ** exp_lr)
            * (bsz ** exp_bsz)
            * (data_size ** exp_data)
            * (n_params ** exp_params)
        )

    return [{'lm_loss': float(_predict(row))} for row in input_data]

#3 Run 3 R² = -0.773483

▼

Python

# Auto-generated scaling law function
# Formula: lm_loss = K_g * lr**a_g * bsz**b_g * data_size**c_g * non_embedding_param_size**d_g
# Coefficients fitted per group using log-linear regression on the provided dataset.
from typing import List, Dict

# Fitted power-law parameters, keyed by group name.
_COEFS = {
    "all_data": dict(
        K=21.7693221886775,                # multiplicative prefactor
        a_lr=0.00863691905384939,          # exponent on lr
        b_bsz=-0.0005162836622543786,      # exponent on bsz
        c_data=-0.04700957690670226,       # exponent on data_size
        d_params=-0.05174150134631458,     # exponent on non_embedding_param_size
        # Fit diagnostics (presumably R^2 in log space and the sample count);
        # they are not used when predicting.
        r2_log=0.8839684523261057,
        n=2702,
    ),
}

# Some datasets may have a single group name like "all" if no group column existed.
# The caller passes a group string; we'll fall back to the first available group if missing.

def _predict_one(x: Dict[str, float], coefs: Dict[str, float]) -> float:
    """Return the power-law prediction for a single data point.

    Computes

        K * lr**a_lr * bsz**b_bsz * data_size**c_data
          * non_embedding_param_size**d_params

    Args:
        x: Mapping from input name to value. A missing key is read as 0.0.
        coefs: Mapping holding ``K``, ``a_lr``, ``b_bsz``, ``c_data`` and
            ``d_params``.

    Returns:
        The predicted value as a float.
    """
    # Every input is floored at this value so the bases stay positive.
    floor = 1e-12
    input_keys = ('lr', 'bsz', 'data_size', 'non_embedding_param_size')
    exponent_keys = ('a_lr', 'b_bsz', 'c_data', 'd_params')

    bases = [max(float(x.get(key, 0.0)), floor) for key in input_keys]
    result = float(coefs['K'])
    exponents = [float(coefs[key]) for key in exponent_keys]

    # Multiply the factors in left-to-right order: lr, bsz, data, params.
    for base, exponent in zip(bases, exponents):
        result = result * (base ** exponent)
    return result


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``lm_loss`` for every data point using the fitted power law.

    Args:
        input_data: One dictionary per data point, mapping input variable
                    names to their values.
        group: Name of the experimental group whose coefficients are used.
               The functional form is shared by all groups; only the
               coefficients differ. A group without an entry in ``_COEFS``
               uses the first group in that table.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key ``'lm_loss'``.

    Raises:
        TypeError: If ``input_data`` is not a list.
        ValueError: If the group is unknown and ``_COEFS`` is empty.
    """
    if not isinstance(input_data, list):
        raise TypeError('input_data must be a list of dicts')

    group_coefs = _COEFS.get(group)
    if group_coefs is None:
        # Unknown group: borrow the first entry of the table.
        if not _COEFS:
            raise ValueError('No coefficients available')
        group_coefs = _COEFS[next(iter(_COEFS))]

    return [
        {'lm_loss': float(_predict_one(point, group_coefs))}
        for point in input_data
    ]

#4 Run 4 R² = -0.773483

▼

Python

from math import exp, log

# Learned log-linear coefficients, keyed by group name.
_COEFFS = {
    'all_data': {
        'w0': 3.0805017396527683,           # intercept (log space)
        'w_lr': 0.00863691905384939,        # weight on log(lr)
        'w_bsz': -0.0005162836622543786,    # weight on log(bsz)
        'w_data': -0.04700957690670226,     # weight on log(data_size)
        'w_param': -0.05174150134631458,    # weight on log(non_embedding_param_size)
        # Fit diagnostics; not used when predicting.
        'rmse_log': 0.026407643979996752,
        'n': 2702,
    },
}
# The global fallback entry carries the same values as 'all_data', stored as
# its own dictionary.
_COEFFS['__global__'] = dict(_COEFFS['all_data'])

# Group used when the requested one is unknown: '__global__' if present,
# otherwise the first group in the table, otherwise None.
if '__global__' in _COEFFS:
    _FALLBACK_GROUP = '__global__'
elif _COEFFS:
    _FALLBACK_GROUP = next(iter(_COEFFS))
else:
    _FALLBACK_GROUP = None


def _predict_one(x: dict, w: dict) -> dict:
    """Predict ``lm_loss`` for one data point with the log-linear model.

    Evaluates

        exp(w0 + w_lr*log(lr) + w_bsz*log(bsz) + w_data*log(data_size)
               + w_param*log(non_embedding_param_size))

    Args:
        x: Mapping from input name to value. A missing key is read as 0.0.
        w: Mapping holding ``w0``, ``w_lr``, ``w_bsz``, ``w_data`` and
            ``w_param``.

    Returns:
        A dictionary with the single key ``'lm_loss'``.
    """
    floor = 1e-12  # stand-in for non-positive inputs so log() stays defined
    pairs = (
        ('w_lr', 'lr'),
        ('w_bsz', 'bsz'),
        ('w_data', 'data_size'),
        ('w_param', 'non_embedding_param_size'),
    )

    # Read and clamp every input before touching the weights.
    values = []
    for _, input_key in pairs:
        value = float(x.get(input_key, 0.0))
        values.append(floor if value <= 0 else value)

    # Accumulate the log-space prediction term by term, in a fixed order.
    log_loss = w['w0']
    for (weight_key, _), value in zip(pairs, values):
        log_loss = log_loss + w[weight_key] * log(value)

    return {'lm_loss': float(exp(log_loss))}


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``lm_loss`` for every data point using the log-linear model.

    Args:
        input_data: One dictionary per data point, mapping input variable
                    names to their values.
        group: Name of the experimental group whose coefficients are used.
               The functional form is fixed (log-linear in the inputs); only
               the coefficients differ per group. A group without an entry in
               ``_COEFFS`` uses the entry named by ``_FALLBACK_GROUP``.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary with
        the single key ``'lm_loss'``. Every value is NaN when ``_COEFFS`` is
        empty.
    """
    predictions = []

    if _COEFFS:
        weights = _COEFFS.get(group)
        if weights is None:
            # Unknown group: use the designated fallback entry.
            weights = _COEFFS.get(_FALLBACK_GROUP)
        for point in input_data:
            predictions.append(_predict_one(point, weights))
    else:
        # No coefficients at all: emit one NaN placeholder per input.
        for _ in input_data:
            predictions.append({'lm_loss': float('nan')})

    return predictions