
LR & Batch Size Scaling Law

Agent: mini-swe-agent
Model: GPT-5
Best R²: 0.373525
Mean R²: -0.268718
Min R²: -1.000000
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.373525
Python
# Auto-generated scaling law implementation
# Discovered via ridge regression (manual) on polynomial-in-log features
# Do not modify the function signature.

from typing import List, Dict
import math

FEATURES = ['bias', 'L', 'B', 'D', 'P', 'L2', 'B2', 'D2', 'P2', 'LB', 'LD', 'LP', 'BD', 'BP', 'DP']

COEFS_BY_GROUP = {'all_data': {'coef': [16.624581903612846, 0.2627109539547664, 0.8995972963599023, -2.109340807436253, -0.3416462681138454, 0.14849884087182352, 0.1269746750542109, 0.13485667144489863, 0.07916170471632446, -0.08188202638168432, -0.024745326001810515, 0.12219666925411721, -0.12293565944271072, -0.0525403200519685, -0.08250175820236673], 'uses_bias_feature': True}}

def _make_features_one(x: Dict[str, float]) -> List[float]:
    # Build degree-2 polynomial features in log10 of lr, bsz, data_size,
    # and non_embedding_param_size, in the order given by FEATURES.
    L = math.log10(x["lr"])
    B = math.log10(x["bsz"])
    D = math.log10(x["data_size"])
    P = math.log10(x["non_embedding_param_size"])
    feats = {
        "bias": 1.0,
        "L": L, "B": B, "D": D, "P": P,
        "L2": L*L, "B2": B*B, "D2": D*D, "P2": P*P,
        "LB": L*B, "LD": L*D, "LP": L*P,
        "BD": B*D, "BP": B*P, "DP": D*P,
    }
    return [feats[k] for k in FEATURES]

def _predict_one(x: Dict[str, float], group: str) -> float:
    # Fall back to the first known group's coefficients if the requested group is unseen.
    if group in COEFS_BY_GROUP:
        g = group
    elif COEFS_BY_GROUP:
        g = next(iter(COEFS_BY_GROUP))
    else:
        raise ValueError("No coefficients available for prediction.")
    coef = COEFS_BY_GROUP[g]["coef"]
    feats = _make_features_one(x)
    return sum(c*f for c, f in zip(coef, feats))

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predicts output variables based on input variables according to a discovered scaling law.

    Args:
        input_data: A list of dictionaries, where each dictionary is a single data
                    point containing input variable names as keys and their
                    corresponding values.
        group: The name of the experimental group for which to make predictions.
                The functional form of the law is the same for all groups,
                but the coefficients can differ per group.

    Returns:
        A list of dictionaries, corresponding to the input_data list, with each
        dictionary containing the predicted output variable(s).
    """
    outputs = []
    for x in input_data:
        y = _predict_one(x, group)
        outputs.append({"lm_loss": float(y)})
    return outputs
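
A minimal usage sketch follows, assuming the definitions above are in scope. The input values are illustrative only; the required keys (lr, bsz, data_size, non_embedding_param_size) and the "all_data" group name come from the implementation above.

Python
# Usage sketch (illustrative values; assumes law() from the code above is in scope).
points = [
    {"lr": 3e-4, "bsz": 256, "data_size": 1e10, "non_embedding_param_size": 1e8},
    {"lr": 1e-3, "bsz": 1024, "data_size": 5e10, "non_embedding_param_size": 4e8},
]
for point, pred in zip(points, law(points, group="all_data")):
    print(point, "->", pred["lm_loss"])

The header comment states the coefficients were discovered via ridge regression on polynomial-in-log features. Below is a hedged sketch of how coefficients of the same shape could be refit; it is not the run's exact procedure. It assumes a training set of the same variables is available (none is included here), uses scikit-learn's Ridge with an arbitrary alpha, and treats the explicit bias column as the intercept.

Python
# Hedged refit sketch: one way to obtain a coefficient vector matching FEATURES.
# Assumes _make_features_one and FEATURES from the code above are in scope, and
# that training rows/losses exist (they are not part of this page).
import numpy as np
from sklearn.linear_model import Ridge

def fit_group_coefs(rows, losses, alpha=1.0):
    # rows: list of input dicts as accepted by law(); losses: observed lm_loss values.
    X = np.array([_make_features_one(r) for r in rows])
    y = np.array(losses, dtype=float)
    model = Ridge(alpha=alpha, fit_intercept=False)  # bias is already an explicit feature
    model.fit(X, y)
    return list(model.coef_)  # ordered to match FEATURES
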
#2 Run 2 R² = 0.031133
#3 Run 3 R² = 0.025235
#4 Run 4 R² = -0.773483
#5 Run 5 R² = -1.000000
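
For reference, the Best, Mean, and Min R² figures reported at the top are simply the maximum, arithmetic mean, and minimum of the five per-run scores listed above:

Python
# Reproduce the summary statistics from the five per-run R² scores listed above.
r2_scores = [0.373525, 0.031133, 0.025235, -0.773483, -1.000000]

print(f"Best R²: {max(r2_scores):.6f}")                   # 0.373525
print(f"Mean R²: {sum(r2_scores) / len(r2_scores):.6f}")  # -0.268718
print(f"Min R²: {min(r2_scores):.6f}")                    # -1.000000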