Domain Mixture Scaling Law

Agent: codex
Model: GPT-5
Best R²: 0.990428
Mean R²: 0.933260
Min R²: 0.834132
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.990428 (best)
Python
from __future__ import annotations

import math
from typing import Dict, List


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The law models each domain's validation loss as the sum of:
      - a group- and domain-specific intercept a_i,
      - a group- and domain-specific coefficient b_i times log(p_i + eps), capturing
        diminishing returns from allocating more mixture proportion to the same domain,
      - a linear combination of the proportions of the other domains (j != i),
        with group- and domain-specific coefficients c_{i,j}.

    Mathematically, for domain i in {1..5}:
        loss_i = a_i + b_i * log(p_i + eps) + sum_{j != i} c_{i,j} * p_j

    where p_k are the mixture proportions (sum_k p_k = 1), and eps is a small constant
    to handle zero proportions inside the logarithm.

    Args:
        input_data: List of dicts with keys 'proportion_domain_1'..'proportion_domain_5'.
        group: One of the experimental groups ("70M", "160M", "305M", or "410M").
               The same functional form is used for all groups, with coefficients
               differing per group.

    Returns:
        A list of dicts with keys 'loss_domain_1'..'loss_domain_5'.
    """

    # Small constant to avoid log(0)
    EPS = 1e-6

    # Coefficients fitted per group on the provided dataset (/app/data), using the
    # model: loss_i = a_i + b_i * log(p_i + EPS) + sum_{j != i} c_{i,j} * p_j
    # (a fitting sketch follows the run listing below). For convenience, linear
    # coefficients are stored as a full length-5 vector per domain, with 0.0 in
    # the self-domain (j == i) slot.
    COEFFS = {
        "70M": {
            1: {"a": 2.352400, "b": -0.041342, "c": [0.000000, 0.552302, 0.679733, 0.457510, 0.478500]},
            2: {"a": 3.119185, "b": -0.005609, "c": [0.733329, 0.000000, 0.567223, 0.760307, 0.571576]},
            3: {"a": 1.557687, "b": -0.029500, "c": [1.776484, 1.574088, 0.000000, 1.672027, 1.590520]},
            4: {"a": 1.005729, "b": -0.040741, "c": [0.682161, 0.804593, 0.768164, 0.000000, 0.680742]},
            5: {"a": 3.401418, "b": -0.019938, "c": [0.282951, 0.204621, 0.280657, 0.244292, 0.000000]},
        },
        "160M": {
            1: {"a": 2.084419, "b": -0.039436, "c": [0.000000, 0.515541, 0.590549, 0.410446, 0.414215]},
            2: {"a": 2.848965, "b": -0.005760, "c": [0.664815, 0.000000, 0.533358, 0.698111, 0.486927]},
            3: {"a": 1.375788, "b": -0.028472, "c": [1.645880, 1.472320, 0.000000, 1.592583, 1.466833]},
            4: {"a": 0.822570, "b": -0.036176, "c": [0.633280, 0.747330, 0.680942, 0.000000, 0.623930]},
            5: {"a": 3.044954, "b": -0.020112, "c": [0.288934, 0.234711, 0.313982, 0.265677, 0.000000]},
        },
        "305M": {
            1: {"a": 1.965386, "b": -0.039011, "c": [0.000000, 0.461256, 0.591688, 0.362942, 0.378769]},
            2: {"a": 2.675656, "b": -0.004898, "c": [0.681773, 0.000000, 0.558797, 0.717652, 0.506549]},
            3: {"a": 1.389474, "b": -0.030900, "c": [1.455301, 1.326467, 0.000000, 1.424874, 1.288538]},
            4: {"a": 0.758123, "b": -0.034855, "c": [0.586244, 0.671620, 0.645107, 0.000000, 0.580221]},
            5: {"a": 2.880988, "b": -0.021162, "c": [0.278675, 0.225879, 0.321137, 0.249162, 0.000000]},
        },
        "410M": {
            1: {"a": 1.904173, "b": -0.038724, "c": [0.000000, 0.497929, 0.520547, 0.389682, 0.371875]},
            2: {"a": 2.648743, "b": -0.005145, "c": [0.632228, 0.000000, 0.458498, 0.688205, 0.451025]},
            3: {"a": 1.311117, "b": -0.031575, "c": [1.474932, 1.346313, 0.000000, 1.429078, 1.297670]},
            4: {"a": 0.726224, "b": -0.033638, "c": [0.560347, 0.717670, 0.657147, 0.000000, 0.569629]},
            5: {"a": 2.802291, "b": -0.021963, "c": [0.276436, 0.261534, 0.247464, 0.274675, 0.000000]},
        },
    }

    # Fallback: if an unknown group is provided, default to the coefficients of
    # the smallest model ("70M").
    params_by_group = COEFFS.get(group)
    if params_by_group is None:
        params_by_group = COEFFS["70M"]

    outputs: List[Dict[str, float]] = []
    for row in input_data:
        # Read proportions in a fixed order
        p = [float(row.get(f"proportion_domain_{i}", 0.0)) for i in range(1, 6)]
        # Normalize defensively in case inputs are not perfectly normalized
        s = sum(p)
        if s > 0:
            p = [pi / s for pi in p]

        pred: Dict[str, float] = {}
        for i in range(1, 6):
            par = params_by_group[i]
            a = par["a"]
            b = par["b"]
            c = par["c"]  # length-5, zero at index i-1
            log_term = math.log(max(p[i - 1], 0.0) + EPS)
            linear_term = sum(c[j] * p[j] for j in range(5))
            y = a + b * log_term + linear_term
            pred[f"loss_domain_{i}"] = float(y)

        outputs.append(pred)

    return outputs

#2 Run 2 R² = 0.971446
#3 Run 3 R² = 0.971092
#4 Run 4 R² = 0.899201
#5 Run 5 R² = 0.834132
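
For a quick sanity check, the Best Run 1 predictor can be called directly. This is a minimal usage sketch: the uniform mixture below is illustrative and is not taken from the evaluation data.

Python
# Assumes `law` from the Best Run 1 listing above is in scope.
# Hypothetical input: a single uniform mixture over the five domains.
mix = [{
    "proportion_domain_1": 0.2,
    "proportion_domain_2": 0.2,
    "proportion_domain_3": 0.2,
    "proportion_domain_4": 0.2,
    "proportion_domain_5": 0.2,
}]

preds = law(mix, group="410M")
# Each prediction dict holds 'loss_domain_1'..'loss_domain_5'.
print(preds[0]["loss_domain_3"])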
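
The COEFFS table in the Best Run 1 code is described as fitted per group on /app/data, but the fitting code itself is not shown on this page. Because the model is linear in (a_i, b_i, c_{i,j}) once log(p_i + EPS) is treated as a feature, each (group, domain) pair can be fitted independently by ordinary least squares. The sketch below assumes that setup; fit_domain and its row format are illustrative, not part of the run.

Python
import math

import numpy as np

EPS = 1e-6  # must match the EPS used at prediction time


def fit_domain(rows, i):
    """Least-squares fit of a_i, b_i, and c_{i,j} for domain i.

    rows: iterable of (p, y) pairs, where p is a length-5 proportion
    vector (summing to 1) and y is the observed loss for domain i.
    """
    X, t = [], []
    for p, y in rows:
        # Features: intercept, log(p_i + EPS), and the other four proportions.
        others = [p[j] for j in range(5) if j != i - 1]
        X.append([1.0, math.log(p[i - 1] + EPS)] + others)
        t.append(y)
    beta, *_ = np.linalg.lstsq(np.asarray(X), np.asarray(t), rcond=None)
    a, b = float(beta[0]), float(beta[1])
    c = [float(v) for v in beta[2:]]
    c.insert(i - 1, 0.0)  # zero self-coefficient, matching the COEFFS layout
    return {"a": a, "b": b, "c": c}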