SLD - U-shaped Scaling Law - openhands + GPT-5

All Runs (sorted by R²)

Best Run 1 R² = 0.241499

▼

Python

from typing import List, Dict

# Discovered U-shaped scaling law (shared functional form across groups):
#   brier_score = y0 + A * (log_flops - x0)**2
# Parameters (x0, y0, A) are fitted per group. If an unknown group is provided,
# a global fallback is used.

# Fitted parameters keyed by group name. Each row below is
# (group, x0, y0, A) for brier_score = y0 + A * (log_flops - x0)**2.
# "__global__" is the entry used when a caller passes an unknown group.
_PARAMS: Dict[str, Dict[str, float]] = {
    name: {"x0": x0, "y0": y0, "A": amp}
    for name, x0, y0, amp in (
        ("__global__", -2.491095, -0.457328, 0.012506),
        ("abstract_narrative_understanding", -2.491095, -0.713674, 0.027345),
        ("analogical_similarity", -2.311584, -0.545771, 0.000228),
        ("arc", -2.491095, -0.159034, 0.008152),
        ("arithmetic", -2.249114, -0.344953, 0.012584),
        ("conceptual_combinations", -2.065126, -0.479229, 0.009269),
        ("hellaswag", -2.491095, -0.105888, 0.006045),
        ("hindu_knowledge", 1.739063, -0.449270, 0.007961),
        ("mmlu", 2.596860, -0.562981, 0.012202),
        ("parsinlu_qa_mc", -2.491095, -0.438630, 0.000321),
    )
}


def _predict_brier(log_flops: float, params: Dict[str, float]) -> float:
    """Evaluate the vertex-form parabola ``y0 + A * (log_flops - x0)**2``.

    Args:
        log_flops: The input value at which to evaluate the law.
        params: Mapping that provides the keys ``"x0"``, ``"y0"`` and ``"A"``.

    Returns:
        The predicted value at ``log_flops``.

    Raises:
        KeyError: If ``params`` lacks one of the three required keys.
    """
    offset = log_flops - params["x0"]
    # Square by multiplication rather than ``**`` so results stay bit-identical.
    return params["y0"] + params["A"] * (offset * offset)


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``brier_score`` for every input row using the U-shaped scaling law.

    Functional form (identical for every group):
        brier_score = y0 + A * (log_flops - x0)**2

    Args:
        input_data: A list of dictionaries, one per data point. Each dictionary
                    must contain the key 'log_flops'; its value is converted
                    with ``float`` before use.
        group: Name of the experimental group whose fitted parameters should be
               used. A name that is not present in ``_PARAMS`` falls back to
               the ``"__global__"`` entry.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input row,
        in the same order as ``input_data``.

    Raises:
        KeyError: If a row does not contain 'log_flops'.
    """
    fitted = _PARAMS.get(group, _PARAMS["__global__"])
    return [
        {"brier_score": float(_predict_brier(float(point["log_flops"]), fitted))}
        for point in input_data
    ]

#2 Run 2 R² = 0.225472

▼

Python

from __future__ import annotations
from typing import List, Dict

# Discovered U-shaped scaling law:
# brier_score = k[group] + a[group] * (log_flops - x0[group])**2
# Coefficients were fitted per group with robust least squares (a >= 0).

# Fitted parameters keyed by group name. Each row below is
# (group, k, a, x0) for brier_score = k + a * (log_flops - x0)**2.
# "__fallback__" is the entry used when a caller passes an unknown group.
_PARAMS: Dict[str, Dict[str, float]] = {
    name: {"k": k, "a": a, "x0": x0}
    for name, k, a, x0 in (
        ("abstract_narrative_understanding", -0.748594533797, 0.024704417209, -2.899629454882),
        ("analogical_similarity", -0.540756745907, 1e-08, -2.898609539811),
        ("arc", -0.156787963822, 0.006677299024, -2.899629454882),
        ("arithmetic", -0.278396491825, 0.006154937508, -2.899629454878),
        ("conceptual_combinations", -0.511782360394, 0.007987946388, -2.899629454882),
        ("hellaswag", -0.102537693293, 0.004835027871, -2.899629454879),
        ("hindu_knowledge", -0.461621509031, 0.004504872299, 2.859498558188),
        ("mmlu", -0.55637987553, 0.014239476708, 2.259725760138),
        ("parsinlu_qa_mc", -0.431186105608, 1e-08, -2.897535806965),
        ("__fallback__", -0.447878611334, 0.034094242543, -0.895980722457),
    )
}


def _predict(log_flops: float, p: Dict[str, float]) -> float:
    """Evaluate ``k + a * (log_flops - x0)**2`` for one parameter set.

    Args:
        log_flops: The input value at which to evaluate the law.
        p: Mapping that provides the keys ``"k"``, ``"a"`` and ``"x0"``.

    Returns:
        The predicted value at ``log_flops``.

    Raises:
        KeyError: If ``p`` lacks one of the three required keys.
    """
    baseline = p["k"]
    curvature = p["a"]
    shift = log_flops - p["x0"]
    return baseline + curvature * shift ** 2


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` for every input row using the U-shaped scaling law.

    Functional form (identical for every group):
        brier_score = k + a * (log_flops - x0)**2

    Args:
        input_data: A list of dictionaries, one per data point. Each dictionary
                    must provide a non-null 'log_flops' value; it is converted
                    with ``float`` before use.
        group: The name of the experimental group for which to make predictions.
                A name that is not present in ``_PARAMS`` falls back to the
                ``"__fallback__"`` entry.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input row,
        in the same order as ``input_data``.

    Raises:
        TypeError: If a row has no 'log_flops' entry or its value is ``None``.
    """
    params = _PARAMS.get(group, _PARAMS["__fallback__"])
    out: List[Dict[str, float]] = []
    for index, row in enumerate(input_data):
        log_flops = row.get("log_flops")
        if log_flops is None:
            # The previous code let float(None) raise here. The exception type
            # is kept so existing handlers still match, but the message now
            # says which row is broken and what is missing.
            raise TypeError(
                f"input_data[{index}] must include a non-null 'log_flops' value"
            )
        yhat = _predict(float(log_flops), params)
        out.append({"brier_score": float(yhat)})
    return out

#3 Run 3 R² = 0.141215

▼

Python

from typing import List, Dict

# Discovered U-shaped scaling law (per-group parameters)
# Functional form: brier_score = d + a * (log_flops - c)**2, with a >= 0

# Per-group coefficients fitted via constrained non-linear least squares
# Per-group coefficients. Each row below is (group, a, c, d) for
# brier_score = d + a * (log_flops - c)**2.
# NOTE(review): c is exactly -10.0 or 10.0 for every group except "mmlu",
# which looks like the fit stopped on a parameter bound - confirm against the
# fitting script before trusting these vertices.
COEFFS: Dict[str, Dict[str, float]] = {
    name: {"a": a, "c": c, "d": d}
    for name, a, c, d in (
        ("abstract_narrative_understanding", 0.008467468113573256, -10.0, -1.3910788665280118),
        ("analogical_similarity", 0.00013496608274505986, -10.0, -0.5594784378689742),
        ("arc", 0.002670618953008418, -10.0, -0.3791484043679726),
        ("arithmetic", 0.003889937618786338, -10.0, -0.6770289854837191),
        ("conceptual_combinations", 0.002411414906857867, -10.0, -0.6810586583653832),
        ("hellaswag", 0.0020058213876494877, -10.0, -0.27226736346486907),
        ("hindu_knowledge", 0.0015094214985086043, 10.0, -0.5735731165193414),
        ("mmlu", 0.01147626953954232, 2.7435063200062424, -0.5667445498636151),
        ("parsinlu_qa_mc", 0.00032584434240010923, -10.0, -0.4746927422057731),
    )
}

# Fallback coefficients (pooled fit across all groups), used for unknown groups.
DEFAULT_COEFFS = dict(
    a=0.023678322162278612,
    c=-0.8996294548824371,
    d=-0.3978978627033718,
)


def _get_log_flops(row: Dict[str, float]) -> float:
    """Extract the ``log_flops`` input from one data row.

    Resolution order:
      1. A non-null ``"log_flops"`` entry, converted with ``float``.
      2. Otherwise ``log10`` of a positive ``"flops"`` entry.
      3. Otherwise ``0.0``.

    NOTE(review): the final 0.0 fallback silently hides rows that carry
    neither usable key; callers get a prediction instead of an error.

    Args:
        row: A single data point mapping input names to values.

    Returns:
        The value to feed into the scaling law, always a ``float``.
    """
    import math

    direct = row.get("log_flops")
    if direct is not None:
        return float(direct)

    raw_flops = row.get("flops")
    # ``not (x > 0)`` rather than ``x <= 0`` so NaN also takes the fallback.
    if raw_flops is None or not (raw_flops > 0):
        return 0.0
    return float(math.log10(raw_flops))


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``brier_score`` for every input row using the U-shaped scaling law.

    Functional form (identical for every group):
        brier_score = d + a * (log_flops - c)**2

    Args:
        input_data: A list of dictionaries, one per data point. The input value
                    for each row is obtained through ``_get_log_flops``.
        group: The name of the experimental group for which to make predictions.
                A name that is not present in ``COEFFS`` uses ``DEFAULT_COEFFS``.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input row,
        in the same order as ``input_data``.
    """
    fitted = COEFFS.get(group, DEFAULT_COEFFS)
    # Convert once up front; the coefficients do not change between rows.
    curvature = float(fitted["a"])
    vertex = float(fitted["c"])
    offset = float(fitted["d"])

    return [
        {"brier_score": float(offset + curvature * (_get_log_flops(row) - vertex) ** 2)}
        for row in input_data
    ]

#4 Run 4 R² = -1.000000

▼

Python

# Quadratic scaling law per group, fitted on the provided dataset at /app/data
# brier_score_hat = a * (log_flops)**2 + b * (log_flops) + c

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` for every input row with a per-group quadratic.

    Functional form (identical for every group):
        brier_score = a * log_flops**2 + b * log_flops + c

    Args:
        input_data: A list of dictionaries, one per data point. Each dictionary
                    must contain the key 'log_flops'; its value is converted
                    with ``float`` before use.
        group: The name of the experimental group for which to make predictions.
                A name without its own coefficients uses the global fit.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input row,
        in the same order as ``input_data``.

    Raises:
        KeyError: If a row does not contain 'log_flops'.
    """
    # (a, b, c) per group.
    fitted: dict[str, tuple[float, float, float]] = {
        "abstract_narrative_understanding": (-0.001002095718968019, 0.18472699005645857, -0.5431407140744654),
        "analogical_similarity": (-0.019175879672698144, 0.02791128748347238, -0.540575053773558),
        "arc": (-0.03686820639366876, 0.1176194903989729, -0.10711223271542945),
        "arithmetic": (-0.12997814962868384, 0.2353700979752282, -0.24753267771220774),
        "conceptual_combinations": (-0.07148356706471536, 0.09692595522861094, -0.40934554313141797),
        "hellaswag": (-0.033670645755682356, 0.09805145434945439, -0.06719686154646048),
        "hindu_knowledge": (-0.03440238896008094, -0.031143510554884568, -0.41031741937809096),
        "mmlu": (0.011476264280523023, -0.06297043488789655, -0.48036465021983477),
        "parsinlu_qa_mc": (-0.05656739537407183, 0.09890583732640096, -0.43495071806820157),
    }
    # (a, b, c) used for any group that is not listed above.
    pooled: tuple[float, float, float] = (
        0.0026446732472713928,
        0.07737556836857278,
        -0.3784396938370408,
    )
    quad, lin, const = fitted.get(group, pooled)

    def predict(row: dict[str, float]) -> dict[str, float]:
        # Checked explicitly so the error carries a descriptive message.
        if "log_flops" not in row:
            raise KeyError("Each input datum must include 'log_flops'.")
        x = float(row["log_flops"])
        return {"brier_score": float(quad * (x ** 2) + lin * x + const)}

    return [predict(row) for row in input_data]

#5 Run 5 R² = -1.000000

▼

Python

from __future__ import annotations
from typing import Dict, List

# Quadratic scaling law shared across groups:
#   brier_score = a * (log_flops**2) + b * log_flops + c
# Coefficients fitted per group via least-squares on /app/data
# Per-group coefficients stored as (a, b, c) for
# brier_score = a * log_flops**2 + b * log_flops + c.
_COEFFS: Dict[str, tuple[float, float, float]] = {
    "abstract_narrative_understanding": (
        -0.001002095718968019,  # a
        0.18472699005645857,  # b
        -0.5431407140744654,  # c
    ),
    "analogical_similarity": (
        -0.019175879672698144,  # a
        0.02791128748347238,  # b
        -0.540575053773558,  # c
    ),
    "arc": (
        -0.03686820639366876,  # a
        0.1176194903989729,  # b
        -0.10711223271542945,  # c
    ),
    "arithmetic": (
        -0.12997814962868384,  # a
        0.2353700979752282,  # b
        -0.24753267771220774,  # c
    ),
    "conceptual_combinations": (
        -0.07148356706471536,  # a
        0.09692595522861094,  # b
        -0.40934554313141797,  # c
    ),
    "hellaswag": (
        -0.033670645755682356,  # a
        0.09805145434945439,  # b
        -0.06719686154646048,  # c
    ),
    "hindu_knowledge": (
        -0.03440238896008094,  # a
        -0.031143510554884568,  # b
        -0.41031741937809096,  # c
    ),
    "mmlu": (
        0.011476264280523023,  # a
        -0.06297043488789655,  # b
        -0.48036465021983477,  # c
    ),
    "parsinlu_qa_mc": (
        -0.05656739537407183,  # a
        0.09890583732640096,  # b
        -0.43495071806820157,  # c
    ),
}

# Global fallback (a, b, c) if a new/unknown group appears at evaluation time.
_DEFAULT_COEFFS: tuple[float, float, float] = (
    0.0026446732472713928,  # a
    0.07737556836857278,  # b
    -0.3784396938370408,  # c
)


def _predict_single(x: float, coeffs: tuple[float, float, float]) -> float:
    """Evaluate the quadratic ``a * x**2 + b * x + c``.

    Args:
        x: The input value at which to evaluate the polynomial.
        coeffs: The three coefficients in the order ``(a, b, c)``.

    Returns:
        The value of the polynomial at ``x``.
    """
    quad, lin, const = coeffs
    squared_term = quad * (x ** 2)
    linear_term = lin * x
    return squared_term + linear_term + const


def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Predict ``brier_score`` for every input row with a per-group quadratic.

    Functional form (identical for every group):
        brier_score = a * log_flops**2 + b * log_flops + c

    Args:
        input_data: A list of dictionaries, one per data point. Each dictionary
                    must contain the key 'log_flops'; its value is converted
                    with ``float`` before use.
        group: The name of the experimental group for which to make predictions.
                A name that is not present in ``_COEFFS`` uses
                ``_DEFAULT_COEFFS``.

    Returns:
        A list with one ``{"brier_score": value}`` dictionary per input row,
        in the same order as ``input_data``.

    Raises:
        KeyError: If a row does not contain 'log_flops'.
    """
    fitted = _COEFFS.get(group, _DEFAULT_COEFFS)
    return [
        {"brier_score": float(_predict_single(float(point["log_flops"]), fitted))}
        for point in input_data
    ]