SLD - U-shaped Scaling Law - terminus-2 + GPT-5

All Runs (sorted by R²)

#1 Run 1 (best) R² = 0.150082

▼

Python

# Auto-generated scaling law implementation
# U-shaped quadratic in log_flops: y = a * (log_flops - x0)**2 + c
from typing import List, Dict

# Per-group parameters fitted from /app/data
# Each entry holds the coefficients of y = a * (log_flops - x0)**2 + c for one group.
# NOTE: every group except "mmlu" has a == 0.0, so the quadratic term vanishes
# and the prediction for those groups is the constant c, whatever x0 is.
_PARAMS: dict[str, dict[str, float]] = {
  "mmlu": {
    "a": 0.011476264280523023,
    "x0": 2.7435075277399728,
    "c": -0.5667445812898367
  },
  "parsinlu_qa_mc": {
    "a": 0.0,
    "x0": 1.1106711713084738,
    "c": -0.434241496854291
  },
  "arithmetic": {
    "a": 0.0,
    "x0": 0.854807431011725,
    "c": -0.21644143686194878
  },
  "hindu_knowledge": {
    "a": 0.0,
    "x0": 0.15012816206281548,
    "c": -0.4266515719307887
  },
  "analogical_similarity": {
    "a": 0.0,
    "x0": 0.9563601615678085,
    "c": -0.5431939971887069
  },
  "conceptual_combinations": {
    "a": 0.0,
    "x0": 0.562905378864399,
    "c": -0.41087701400682625
  },
  "hellaswag": {
    "a": 0.0,
    "x0": 1.1106711713084738,
    "c": -0.02325897962914785
  },
  "arc": {
    "a": 0.0,
    "x0": 1.1106711713084738,
    "c": -0.04761015603522527
  },
  "abstract_narrative_understanding": {
    "a": 0.0,
    "x0": 1.1106711713084738,
    "c": -0.33990324185677395
  }
}

# Key tried when a requested group is missing from _PARAMS.
# NOTE: the table above contains no entry under this key.
_DEF_GROUP = 'default'

def _predict_one(x: float, g: str) -> float:
    """Evaluate the vertex-form quadratic ``a * (x - x0)**2 + c`` for one point.

    Args:
        x: The ``log_flops`` value of the data point.
        g: Group name; it is converted with ``str()`` before the lookup.

    Returns:
        The predicted value at ``x`` using the parameters selected for ``g``.

    Parameters are selected in this order:
        1. the ``_PARAMS`` entry for ``str(g)``;
        2. the ``_PARAMS`` entry for ``_DEF_GROUP``, if one exists;
        3. the element-wise mean of all ``_PARAMS`` entries;
        4. all zeros when ``_PARAMS`` is empty.
    """
    p = _PARAMS.get(str(g))
    if p is None:
        p = _PARAMS.get(_DEF_GROUP)
    if p is None:
        if _PARAMS:
            # Unknown group and no default entry: average each coefficient
            # over every known group.
            n = len(_PARAMS)
            p = {k: sum(d[k] for d in _PARAMS.values()) / n for k in ('a', 'x0', 'c')}
        else:
            p = {'a': 0.0, 'x0': 0.0, 'c': 0.0}
    return p['a'] * (x - p['x0'])**2 + p['c']

def law(input_data: List[Dict[str, float]], group: str) -> List[Dict[str, float]]:
    """
    Apply the fitted scaling law to every input row for a single group.

    Args:
        input_data: Data points as dictionaries; each one is read through its
                    'log_flops' entry, which is converted with float().
        group: Name of the experimental group. It is forwarded unchanged to
               `_predict_one` for every row, so all rows share one parameter
               set.

    Returns:
        One dictionary per input row, in input order, of the form
        {'brier_score': <prediction as float>}.

    Raises:
        TypeError: if a row has no 'log_flops' entry, because float() is then
                   applied to None.
    """
    return [
        {'brier_score': float(_predict_one(float(point.get('log_flops')), group))}
        for point in input_data
    ]

#2 Run 2 R² = -0.168710

▼

Python

# Auto-generated scaling law implementation
from __future__ import annotations
from typing import List, Dict

# NOTE(review): not referenced by any code visible in this listing.
_GROUP_COL = 'group'

# Vertex-form quadratic parameters per group: brier = a*(log_flops - c)**2 + d
# Keys: 'a' multiplies the squared term, 'c' is the vertex position on the
# log_flops axis, and 'd' is the value of the curve at the vertex.
# NOTE(review): 'abstract_narrative_understanding' has c ~ -90.6 and d ~ -8.77,
# far from every other group, so any average over these entries is dominated by it.
_PARAMS = {'abstract_narrative_understanding': {'a': 0.001002095718968019, 'c': -90.58429644614202, 'd': -8.766188358818367}, 'analogical_similarity': {'a': 0.019175879672698144, 'c': 0.5477135995579335, 'd': -0.5581696684346125}, 'arc': {'a': 0.03686820639366876, 'c': -0.00909895560472036, 'd': -0.11949288521025427}, 'arithmetic': {'a': 0.12997814962868384, 'c': 0.24363619416244217, 'd': -0.3401225987599409}, 'conceptual_combinations': {'a': 0.07148356706471536, 'c': -0.039142050508100865, 'd': -0.470228179991121}, 'hellaswag': {'a': 0.033670645755682356, 'c': 0.12999784134055556, 'd': -0.07906997378756786}, 'hindu_knowledge': {'a': 0.03440238896008094, 'c': 0.4376516101821685, 'd': -0.44037886576008545}, 'mmlu': {'a': 0.011476264280523023, 'c': 2.7435075277399728, 'd': -0.5667445812898367}, 'parsinlu_qa_mc': {'a': 0.05656739537407183, 'c': 0.7118053877907184, 'd': -0.4826026673672854}}

# Parameters for groups missing from _PARAMS: the per-key arithmetic mean over
# all known groups, or a fixed unit parabola when the table is empty.
_FALLBACK = (
    {
        key: float(sum(entry[key] for entry in _PARAMS.values()) / len(_PARAMS))
        for key in ('a', 'c', 'd')
    }
    if _PARAMS
    else {'a': 1.0, 'c': 0.0, 'd': 0.0}
)

def _get_params(group: str) -> Dict[str, float]:
    """Return the parameter set stored for `group`, or `_FALLBACK` if there is none."""
    if group in _PARAMS:
        return _PARAMS[group]
    return _FALLBACK

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the vertex-form quadratic a*(log_flops - c)**2 + d for each row.

    Args:
        input_data: Data points as dictionaries; each one is read through its
                    'log_flops' entry, which is converted with float().
        group: Name of the experimental group. Its parameters are resolved
               once, through `_get_params`, before any row is processed.

    Returns:
        One dictionary per input row, in input order, of the form
        {'brier_score': <prediction as float>}.

    Raises:
        TypeError: if a row has no 'log_flops' entry, because float() is then
                   applied to None.
    """
    coeffs = _get_params(group)
    curvature, vertex, offset = coeffs['a'], coeffs['c'], coeffs['d']
    xs = (float(point.get('log_flops')) for point in input_data)
    return [{'brier_score': float(curvature * (x - vertex) ** 2 + offset)} for x in xs]

#3 Run 3 R² = -1.000000

▼

Python

# Auto-generated scaling law implementation
# Formula: brier_score = A[group] * log_flops**2 + B[group] * log_flops + C[group]
# Fitted on /app/data using least squares per group.
from __future__ import annotations
from typing import List, Dict

# Per-group coefficients of brier_score = A*log_flops**2 + B*log_flops + C.
PARAMS = {
  "abstract_narrative_understanding": {
    "A": -0.001002095718968019,
    "B": 0.18472699005645857,
    "C": -0.5431407140744654
  },
  "analogical_similarity": {
    "A": -0.019175879672698144,
    "B": 0.02791128748347238,
    "C": -0.540575053773558
  },
  "arc": {
    "A": -0.03686820639366876,
    "B": 0.1176194903989729,
    "C": -0.10711223271542945
  },
  "arithmetic": {
    "A": -0.12997814962868384,
    "B": 0.2353700979752282,
    "C": -0.24753267771220774
  },
  "conceptual_combinations": {
    "A": -0.07148356706471536,
    "B": 0.09692595522861094,
    "C": -0.40934554313141797
  },
  "hellaswag": {
    "A": -0.033670645755682356,
    "B": 0.09805145434945439,
    "C": -0.06719686154646048
  },
  "hindu_knowledge": {
    "A": -0.03440238896008094,
    "B": -0.031143510554884568,
    "C": -0.41031741937809096
  },
  "mmlu": {
    "A": 0.011476264280523023,
    "B": -0.06297043488789655,
    "C": -0.48036465021983477
  },
  "parsinlu_qa_mc": {
    "A": -0.05656739537407183,
    "B": 0.09890583732640096,
    "C": -0.43495071806820157
  }
}
# Coefficients used when the requested group has no entry in PARAMS.
# NOTE(review): these are not the per-key mean of PARAMS (A is positive here
# while most per-group A values are negative); presumably a pooled fit — confirm.
FALLBACK = {
  "A": 0.0026446732472713928,
  "B": 0.07737556836857278,
  "C": -0.3784396938370408
}

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the quadratic A*log_flops**2 + B*log_flops + C for each row.

    Args:
        input_data: Data points as dictionaries. Each one is read through its
                    'log_flops' entry; a row without that entry is treated as
                    log_flops = 0.0.
        group: Name of the experimental group. Groups without an entry in
               `PARAMS` use the `FALLBACK` coefficients.

    Returns:
        One dictionary per input row, in input order, of the form
        {'brier_score': <prediction as float>}.
    """
    coeffs = PARAMS[group] if group in PARAMS else FALLBACK
    quad, lin, const = coeffs['A'], coeffs['B'], coeffs['C']
    xs = (float(point.get('log_flops', 0.0)) for point in input_data)
    return [{'brier_score': float(quad * (x ** 2) + lin * x + const)} for x in xs]

#4 Run 4 R² = -1.000000

▼

Python

# Auto-generated scaling law implementation
# Formula: brier_score = a2 * (log_flops**2) + a1 * log_flops + a0
# Coefficients are per group; unknown groups fall back to global coefficients.
from typing import List, Dict

# Per-group coefficients of brier_score = a2*log_flops**2 + a1*log_flops + a0.
_COEFFS = {
  "abstract_narrative_understanding": {
    "a2": -0.001002095718967912,
    "a1": 0.18472699005645873,
    "a0": -0.5431407140744655
  },
  "analogical_similarity": {
    "a2": -0.019175879672698435,
    "a1": 0.0279112874834725,
    "a0": -0.5405750537735581
  },
  "arc": {
    "a2": -0.036868206393668744,
    "a1": 0.11761949039897288,
    "a0": -0.1071122327154294
  },
  "arithmetic": {
    "a2": -0.12997814962868387,
    "a1": 0.23537009797522832,
    "a0": -0.2475326777122078
  },
  "conceptual_combinations": {
    "a2": -0.07148356706471508,
    "a1": 0.09692595522861085,
    "a0": -0.40934554313141813
  },
  "hellaswag": {
    "a2": -0.033670645755682356,
    "a1": 0.09805145434945438,
    "a0": -0.06719686154646047
  },
  "hindu_knowledge": {
    "a2": -0.034402388960081354,
    "a1": -0.031143510554884814,
    "a0": -0.4103174193780911
  },
  "mmlu": {
    "a2": 0.011476264280523694,
    "a1": -0.06297043488789662,
    "a0": -0.480364650219835
  },
  "parsinlu_qa_mc": {
    "a2": -0.05656739537407183,
    "a1": 0.0989058373264011,
    "a0": -0.43495071806820146
  }
}
# Coefficients applied to any group that has no entry in _COEFFS.
_GLOBAL = {
  "a2": 0.002644673247271387,
  "a1": 0.07737556836857276,
  "a0": -0.3784396938370407
}

def _predict_one(x: float, c: Dict[str, float]) -> float:
    return c["a2"] * (x ** 2) + c["a1"] * x + c["a0"]

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the quadratic a2*log_flops**2 + a1*log_flops + a0 for each row.

    Args:
        input_data: Data points as dictionaries; each one is read through its
                    'log_flops' entry, which is converted with float().
        group: Name of the experimental group. Groups without an entry in
               `_COEFFS` use the `_GLOBAL` coefficients.

    Returns:
        One dictionary per input row, in input order, of the form
        {'brier_score': <prediction as float>}.

    Raises:
        TypeError: if a row has no 'log_flops' entry, because float() is then
                   applied to None.
    """
    chosen = _COEFFS[group] if group in _COEFFS else _GLOBAL
    return [
        {'brier_score': float(_predict_one(float(point.get('log_flops')), chosen))}
        for point in input_data
    ]