
Data-Constrained Scaling Law

Agent: mini-swe-agent
Model: GPT-5
Best R²: 0.942963
Mean R²: 0.902856
Min R²: 0.843528
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.942963 (best run, code shown below)
Python
from typing import List, Dict

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    The functional form (shared across groups):
        loss = L_inf + A * params^{-a_p} + B * tokens^{-a_t} + C * unique_tokens^{-a_u}

    Where (L_inf, A, B, C, a_p, a_t, a_u) are group-specific constants.
    If an unknown group is provided, a default set of coefficients is used.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values. Expected keys: 'params', 'tokens', 'unique_tokens'.
        group: The name of the experimental group for which to make predictions.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s): {'loss': ...}.
    """
    # Coefficients fitted on the provided dataset.
    # Chosen family: additive inverse-power law
    #   loss = L_inf + A * P^{-a_p} + B * T^{-a_t} + C * U^{-a_u}
    COEFFS = {
        "all_data": {
            "Linf": 1.567348010743855,
            "A": 4786.152701939445,
            "B": 33007.3360235617,
            "C": 9.427421564925798,
            "ap": 0.5,
            "at": 0.5,
            "au": 0.1,
        }
    }

    # Fallback to 'all_data' if group not present
    params_for_group = COEFFS.get(group, COEFFS["all_data"])

    Linf = float(params_for_group["Linf"])
    A    = float(params_for_group["A"])
    B    = float(params_for_group["B"])
    C    = float(params_for_group["C"])
    ap   = float(params_for_group["ap"])
    at   = float(params_for_group["at"])
    au   = float(params_for_group["au"])

    eps = 1e-12  # numerical stability for very small/zero inputs

    outputs: List[Dict[str, float]] = []
    for record in input_data:
        P = float(record.get("params", 0.0))
        T = float(record.get("tokens", 0.0))
        U = float(record.get("unique_tokens", 0.0))

        # Guard against non-positive values in power transforms
        P_eff = max(P, eps)
        T_eff = max(T, eps)
        U_eff = max(U, eps)

        loss_pred = Linf + A * (P_eff ** (-ap)) + B * (T_eff ** (-at)) + C * (U_eff ** (-au))
        outputs.append({"loss": float(loss_pred)})

    return outputs
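
Usage sketch for the function above. The input values are illustrative placeholders, not points from the benchmark dataset:

Python
points = [
    {"params": 1.0e9, "tokens": 2.0e10, "unique_tokens": 5.0e9},
    {"params": 1.0e8, "tokens": 5.0e9,  "unique_tokens": 5.0e9},
]
predictions = law(points, "all_data")
print(predictions)  # one {'loss': ...} dict per input record, in input order
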
#2 Run 2 R² = 0.915943
#3 Run 3 R² = 0.914136
#4 Run 4 R² = 0.897712
#5 Run 5 R² = 0.843528
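
For context, a minimal sketch of how coefficients for this additive inverse-power family could be refit with scipy.optimize.curve_fit. This is not the agent's actual fitting code, and the synthetic arrays below (generated from roughly the published coefficients plus noise) stand in for the real (params, tokens, unique_tokens, loss) measurements, which are not shown on this page:

Python
import numpy as np
from scipy.optimize import curve_fit

def additive_power_law(X, Linf, A, B, C, ap, at, au):
    # loss = L_inf + A * P^{-a_p} + B * T^{-a_t} + C * U^{-a_u}
    P, T, U = X
    return Linf + A * P ** (-ap) + B * T ** (-at) + C * U ** (-au)

# Synthetic demonstration data (NOT the benchmark dataset): sample plausible
# ranges for params/tokens/unique_tokens and generate noisy losses from
# approximately the published coefficients.
rng = np.random.default_rng(0)
n = 200
P = rng.uniform(1e7, 1e10, size=n)   # parameter counts
T = rng.uniform(1e9, 1e12, size=n)   # training tokens
U = rng.uniform(1e8, 1e11, size=n)   # unique tokens
true = (1.57, 4786.0, 33007.0, 9.43, 0.5, 0.5, 0.1)
y = additive_power_law((P, T, U), *true) + rng.normal(0.0, 0.01, size=n)

# Rough initial guess and non-negativity bounds; exponents constrained to (0, 1].
p0 = (2.0, 1e3, 1e4, 10.0, 0.5, 0.5, 0.1)
bounds = ([0, 0, 0, 0, 0, 0, 0], [10, 1e6, 1e6, 1e3, 1, 1, 1])
popt, _ = curve_fit(additive_power_law, (P, T, U), y, p0=p0, bounds=bounds, maxfev=20000)
print(dict(zip(["Linf", "A", "B", "C", "ap", "at", "au"], popt)))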