SLD - U-shaped Scaling Law - mini-swe-agent + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.302510

▼

Python

# Auto-generated convex (U-shaped) quadratic scaling law: brier_score vs log_flops
from typing import List, Dict

# y = a * (x - c)**2 + b, x = log_flops, y = brier_score
_COEFS: dict[str, dict[str, float]] = {
  "__default__": {
    "a": 0.016435167540703028,
    "b": -0.42693678124584933,
    "c": -1.6953621257789337
  },
  "abstract_narrative_understanding": {
    "a": 0.03559099077642667,
    "b": -0.644905899063743,
    "c": -1.6953621257789337
  },
  "analogical_similarity": {
    "a": 0.00015458082325933974,
    "b": -0.5443035235943459,
    "c": -1.6056066401185167
  },
  "arc": {
    "a": 0.010326836994446163,
    "b": -0.1361076284330853,
    "c": -1.6953621257789337
  },
  "arithmetic": {
    "a": 0.015354018314906261,
    "b": -0.31591913431197544,
    "c": -1.5743719559664417
  },
  "conceptual_combinations": {
    "a": 0.011559745311168344,
    "b": -0.4646413787056248,
    "c": -1.4823777596427932
  },
  "hellaswag": {
    "a": 0.007609182534322093,
    "b": -0.08846707981157574,
    "c": -1.6953621257789337
  },
  "hindu_knowledge": {
    "a": 0.01020143688094949,
    "b": -0.4433503348958553,
    "c": 1.2992805614553293
  },
  "mmlu": {
    "a": 0.01625188241125213,
    "b": -0.5485520085426114,
    "c": 2.0141120689193435
  },
  "parsinlu_qa_mc": {
    "a": 1e-08,
    "b": -0.4342415825508818,
    "c": -1.6953621257789337
  }
}

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a vertex-form quadratic.

    Each prediction is ``a * (log_flops - c) ** 2 + b``, using the
    coefficients stored in ``_COEFS`` for ``group``. A group with no entry in
    ``_COEFS`` uses the ``"__default__"`` entry instead.

    Args:
        input_data: One dictionary per data point. The value under
                    ``"log_flops"`` is read and converted with ``float()``.
        group: Name of the experimental group whose coefficients to use.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary of
        the form ``{"brier_score": <float>}``.

    Raises:
        KeyError: If a data point has no ``"log_flops"`` key.
    """
    coefs = _COEFS.get(group, _COEFS["__default__"])
    curvature = float(coefs["a"])
    vertex_value = float(coefs["b"])
    vertex_x = float(coefs["c"])

    return [
        {
            "brier_score": float(
                curvature * (float(point["log_flops"]) - vertex_x) ** 2
                + vertex_value
            )
        }
        for point in input_data
    ]

#2 Run 2 R² = 0.241497

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a per-group quadratic.

    Each prediction is ``a2 * log_flops**2 + a1 * log_flops + a0``. This is
    the expanded form of the vertex parabola ``a * (log_flops - c)**2 + b``
    with ``a2 = a``, ``a1 = -2*a*c`` and ``a0 = a*c**2 + b``.

    Args:
        input_data: One dictionary per data point. ``"log_flops"`` is read
                    with ``dict.get`` and converted with ``float()``; a data
                    point without that key is evaluated at ``0.0``.
        group: Name of the experimental group whose coefficients to use. A
               group missing from the table uses the default coefficients.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary of
        the form ``{"brier_score": <float>}``.
    """
    # Rows are (a2, a1, a0); every stored a2 is positive, so each parabola
    # opens upwards.
    per_group = {
        "abstract_narrative_understanding": (
            0.027345297981454717, 0.136239459030282, -0.5439812284717438,
        ),
        "analogical_similarity": (
            0.00022819303071987078, 0.0010549746377413957, -0.5445517480226065,
        ),
        "arc": (
            0.008151806820498677, 0.040613847108095076, -0.10844709861781436,
        ),
        "arithmetic": (
            0.012583964497968407, 0.056605552958780614, -0.28129715701407854,
        ),
        "conceptual_combinations": (
            0.009268857989096976, 0.038282720441071054, -0.4396996360981997,
        ),
        "hellaswag": (
            0.006045219218956594, 0.030118428282210164, -0.06837445680140108,
        ),
        "hindu_knowledge": (
            0.007961278176978049, -0.027690321689855524, -0.42519247446304,
        ),
        "mmlu": (
            0.012201937081809734, -0.06337345377976922, -0.4806948100054965,
        ),
        "parsinlu_qa_mc": (
            0.00032104940646667657, 0.0015995290118497466, -0.4366374889008974,
        ),
    }
    fallback = (0.012506015226011717, 0.06230733891332299, -0.3797213963831994)

    quad, lin, const = per_group.get(group, fallback)
    xs = (float(point.get("log_flops", 0.0)) for point in input_data)
    return [
        {"brier_score": float(quad * (x ** 2) + lin * x + const)}
        for x in xs
    ]

#3 Run 3 R² = -1.000000

▼

Python

# Auto-generated scaling law implementation
from typing import List, Dict

# Per-group coefficients of the quadratic
#     brier_score_hat = a + b * log_flops + c * log_flops**2
# Each row below is (group, a, b, c).
COEFS: Dict[str, Dict[str, float]] = {
    name: {"a": a, "b": b, "c": c}
    for name, a, b, c in (
        ("abstract_narrative_understanding",
         -0.5431407140744655, 0.18472699005645873, -0.001002095718967912),
        ("analogical_similarity",
         -0.5405750537735581, 0.0279112874834725, -0.019175879672698435),
        ("arc",
         -0.1071122327154294, 0.11761949039897288, -0.036868206393668744),
        ("arithmetic",
         -0.2475326777122078, 0.23537009797522832, -0.12997814962868387),
        ("conceptual_combinations",
         -0.40934554313141813, 0.09692595522861085, -0.07148356706471508),
        ("hellaswag",
         -0.06719686154646047, 0.09805145434945438, -0.033670645755682356),
        ("hindu_knowledge",
         -0.4103174193780911, -0.031143510554884814, -0.034402388960081354),
        ("mmlu",
         -0.480364650219835, -0.06297043488789662, 0.011476264280523694),
        ("parsinlu_qa_mc",
         -0.43495071806820146, 0.0989058373264011, -0.05656739537407183),
    )
}

# Coefficients for the same quadratic, kept separately from the per-group
# table as the default set.
DEFAULT_COEFS: Dict[str, float] = {
    "a": -0.3784396938370407,
    "b": 0.07737556836857276,
    "c": 0.002644673247271387,
}

def _get_x(d: Dict[str, float]) -> float:
    if "log_flops" in d:
        return float(d["log_flops"])
    if "flops" in d:
        import math
        return float(math.log(float(d["flops"])))
    raise KeyError("Expected 'log_flops' (or 'flops') in input datum.")

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``brier_score`` for each data point with a per-group quadratic.

    Each prediction is ``a + b * x + c * x**2``, where ``x`` is obtained from
    the data point by ``_get_x`` and ``a``, ``b``, ``c`` come from ``COEFS``
    for ``group``. A group with no entry in ``COEFS`` uses ``DEFAULT_COEFS``.

    Args:
        input_data: One dictionary per data point, in the form accepted by
                    ``_get_x``.
        group: Name of the experimental group whose coefficients to use.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary of
        the form ``{"brier_score": <float>}``.
    """
    chosen = COEFS.get(group, DEFAULT_COEFS)
    intercept, slope, curvature = (float(chosen[key]) for key in ("a", "b", "c"))

    return [
        {"brier_score": float(intercept + slope * x + curvature * (x ** 2))}
        for x in map(_get_x, input_data)
    ]

#4 Run 4 R² = -1.000000

▼

Python

# Auto-generated scaling law implementation
# Formula: brier_score = a * (log_flops**2) + b * log_flops + c
# Coefficients are fitted per group; a single functional form across groups.

from typing import List, Dict

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a per-group quadratic.

    Each prediction is ``a * log_flops**2 + b * log_flops + c``. The
    functional form is shared by all groups; only ``(a, b, c)`` differ.

    Args:
        input_data: A list of dictionaries, one per data point. Each must
                    contain the key ``"log_flops"``; its value is converted
                    with ``float()``.
        group: The name of the experimental group for which to make
               predictions. A group not present in the table uses the
               default/global coefficients.

    Returns:
        A list of dictionaries, corresponding to the input_data list, each of
        the form ``{"brier_score": <float>}``.

    Raises:
        KeyError: If a data point has no ``"log_flops"`` key.
    """
    # Per-group coefficients (a, b, c) for: y = a*x^2 + b*x + c, with x = log_flops
    PARAMS = {
        "abstract_narrative_understanding": (-0.00100209571897, 0.184726990056, -0.543140714074),
        "analogical_similarity": (-0.0191758796727, 0.0279112874835, -0.540575053774),
        "arc": (-0.0368682063937, 0.117619490399, -0.107112232715),
        "arithmetic": (-0.129978149629, 0.235370097975, -0.247532677712),
        "conceptual_combinations": (-0.0714835670647, 0.0969259552286, -0.409345543131),
        "hellaswag": (-0.0336706457557, 0.0980514543495, -0.0671968615465),
        "hindu_knowledge": (-0.0344023889601, -0.0311435105549, -0.410317419378),
        "mmlu": (0.0114762642805, -0.0629704348879, -0.48036465022),
        "parsinlu_qa_mc": (-0.0565673953741, 0.0989058373264, -0.434950718068),
    }

    # Default/global coefficients (fallback if group not found)
    DEFAULT = (0.00264467324727, 0.0773755683686, -0.378439693837)

    a, b, c = PARAMS.get(group, DEFAULT)

    output = []
    for index, row in enumerate(input_data):
        # Check for the key explicitly: row.get("log_flops") would hand None
        # to float() and fail with a TypeError that never mentions the key.
        if "log_flops" not in row:
            raise KeyError(
                f"input_data[{index}] is missing required key 'log_flops'"
            )
        x = float(row["log_flops"])
        y_hat = a * (x ** 2) + b * x + c
        output.append({"brier_score": float(y_hat)})
    return output

#5 Run 5 R² = -1.000000

▼

Python

# Auto-generated scaling law implementation
# Model form: brier_score = a + b * log_flops + c * (log_flops ** 2)

from typing import List, Dict

# Per-group (a, b, c) for: brier_score = a + b * log_flops + c * log_flops**2
COEFFS = {
    "abstract_narrative_understanding": (
        -0.543140714074, 0.184726990056, -0.00100209571897,
    ),
    "analogical_similarity": (
        -0.540575053774, 0.0279112874835, -0.0191758796727,
    ),
    "arc": (
        -0.107112232715, 0.117619490399, -0.0368682063937,
    ),
    "arithmetic": (
        -0.247532677712, 0.235370097975, -0.129978149629,
    ),
    "conceptual_combinations": (
        -0.409345543131, 0.0969259552286, -0.0714835670647,
    ),
    "hellaswag": (
        -0.0671968615465, 0.0980514543495, -0.0336706457557,
    ),
    "hindu_knowledge": (
        -0.410317419378, -0.0311435105549, -0.0344023889601,
    ),
    "mmlu": (
        -0.48036465022, -0.0629704348879, 0.0114762642805,
    ),
    "parsinlu_qa_mc": (
        -0.434950718068, 0.0989058373264, -0.0565673953741,
    ),
}

# (a, b, c) in the same order, stored separately from the per-group table.
GLOBAL_COEFFS = (
    -0.378439693837, 0.0773755683686, 0.00264467324727,
)


def _predict_single(x: float, coeffs: tuple[float, float, float]) -> float:
    a, b, c = coeffs
    return a + b * x + c * (x ** 2)

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` for every data point.

    The coefficients for ``group`` are looked up in ``COEFFS`` (with
    ``GLOBAL_COEFFS`` used for a group that has no entry) and passed to
    ``_predict_single`` together with each point's ``log_flops``.

    Args:
        input_data: One dictionary per data point; each must contain
                    ``'log_flops'``, which is converted with ``float()``.
        group: Name of the experimental group whose coefficients to use.

    Returns:
        A list parallel to ``input_data``; each element is a dictionary of
        the form ``{'brier_score': <float>}``.

    Raises:
        ValueError: If a data point has no ``'log_flops'`` key.
    """
    group_coeffs = COEFFS.get(group, GLOBAL_COEFFS)

    def _score(point: dict[str, float]) -> dict[str, float]:
        # Validate first so a malformed point fails before any arithmetic.
        if 'log_flops' not in point:
            raise ValueError("Each input row must contain 'log_flops'.")
        prediction = _predict_single(float(point['log_flops']), group_coeffs)
        return {'brier_score': float(prediction)}

    return [_score(point) for point in input_data]