SLD - U-shaped Scaling Law - gemini-cli + Gemini 2.5 Flash

All Runs (sorted by R²)

#1 (Best) Run 1 R² = 0.031044

▼

Python

import json
import os

# Location of the coefficients file: it is looked up in the same directory as
# this module.
_PARAMS_FILE_PATH = os.path.join(os.path.dirname(__file__), "fitted_parameters.json")

# Per-group coefficients, read a single time at import. The mapping stays
# empty when the file is missing or does not contain valid JSON; those two
# failures are reported with print() and swallowed, anything else propagates.
_FITTED_PARAMETERS = {}

try:
    f = open(_PARAMS_FILE_PATH, 'r')
except FileNotFoundError:
    print(f"Error: fitted_parameters.json not found at {_PARAMS_FILE_PATH}. "
          "The law function will not be able to make predictions.")
else:
    try:
        # The context manager closes the handle before any message is printed.
        with f:
            _FITTED_PARAMETERS = json.load(f)
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {_PARAMS_FILE_PATH}.")


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the law ``brier_score = A * x + B * (1 / x) + C`` with ``x = log_flops``.

    The coefficients A, B and C are read per group from the module-level
    ``_FITTED_PARAMETERS`` mapping.

    Args:
        input_data: Data points to predict for; each one must provide a
                    'log_flops' entry.
        group: Name of the experimental group whose coefficients are used.
                The functional form is shared by all groups; only the
                coefficients differ.

    Returns:
        One ``{'brier_score': value}`` dictionary per input point, in input
        order. The value is None for every point when the group has no
        coefficients, and None for any point whose 'log_flops' equals 0.
    """
    if group in _FITTED_PARAMETERS:
        coefficients = _FITTED_PARAMETERS[group]
        linear_coef, inverse_coef, offset = (
            coefficients['A'],
            coefficients['B'],
            coefficients['C'],
        )

        def predict(point):
            x = point['log_flops']
            # x == 0 would divide by zero in the 1/x term, so no value is produced.
            if x == 0:
                return None
            return linear_coef * x + inverse_coef * (1 / x) + offset

        return [{'brier_score': predict(point)} for point in input_data]

    # Unknown group: there are no coefficients, so every prediction is None.
    return [{"brier_score": None} for _ in input_data]

#2 Run 2 R² = -1.000000

▼

Python

import json

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a per-group quadratic.

    The law is ``brier_score = a * log_flops**2 + b * log_flops + c``. The
    functional form is the same for every group; only (a, b, c) differ.

    Args:
        input_data: Data points to predict for; each one must provide a
                    'log_flops' entry.
        group: Name of the experimental group whose coefficients are used.

    Returns:
        One ``{'brier_score': value}`` dictionary per input point, in input
        order.

    Raises:
        ValueError: If no coefficients are stored for ``group``.
    """
    # Hard-coded coefficients, stored per group as (a, b, c).
    quadratic_coefficients = {
        "mmlu": (0.01147626903212752, -0.06297044145041197, -0.4803646498255163),
        "parsinlu_qa_mc": (-0.05656739576030594, 0.09890583803784526, -0.4349507180875899),
        "arithmetic": (-0.12997815043774216, 0.2353700987909166, -0.24753267731009526),
        "hindu_knowledge": (-0.034402390318516406, -0.031143510453820404, -0.4103174189329031),
        "analogical_similarity": (-0.019175880244981132, 0.02791128902903651, -0.5405750543843222),
        "conceptual_combinations": (-0.07148356715072232, 0.09692595523920844, -0.40934554304731946),
        "hellaswag": (-0.03367064564717706, 0.09805145414703137, -0.06719686151887498),
        "arc": (-0.03686820640090799, 0.11761949041426581, -0.10711223272051366),
        "abstract_narrative_understanding": (-0.0010020980462024098, 0.18472700388137853, -0.5431407255223071),
    }

    selected = quadratic_coefficients.get(group)
    if selected is None:
        raise ValueError(f"No fitted parameters found for group: {group}")
    quad, lin, const = selected

    # Read each point's input once, then evaluate the quadratic on it.
    flops_values = (point['log_flops'] for point in input_data)
    return [{'brier_score': quad * (x**2) + (lin * x) + const} for x in flops_values]

#3 Run 3 R² = -1.000000

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Evaluate the per-group quadratic ``a * log_flops**2 + b * log_flops + c``.

    Every group shares this functional form; the coefficients a, b and c are
    looked up by group name in a table embedded in the function.

    Args:
        input_data: Data points to predict for; each one must provide a
                    "log_flops" entry.
        group: Name of the experimental group whose coefficients are used.

    Returns:
        One ``{"brier_score": value}`` dictionary per input point, in input
        order.

    Raises:
        ValueError: If ``group`` has no entry in the coefficient table.
    """
    # Coefficients of brier_score = a * log_flops^2 + b * log_flops + c, per group.
    coefficients_by_group = {
        "mmlu": {
            "a": 0.01147626428052324,
            "b": -0.06297043488789657,
            "c": -0.4803646502198346,
        },
        "parsinlu_qa_mc": {
            "a": -0.05656739537407177,
            "b": 0.09890583732640087,
            "c": -0.4349507180682014,
        },
        "arithmetic": {
            "a": -0.12997814962868393,
            "b": 0.23537009797522834,
            "c": -0.2475326777122077,
        },
        "hindu_knowledge": {
            "a": -0.03440238896008072,
            "b": -0.031143510554884634,
            "c": -0.4103174193780912,
        },
        "analogical_similarity": {
            "a": -0.01917587967269841,
            "b": 0.027911287483472436,
            "c": -0.5405750537735577,
        },
        "conceptual_combinations": {
            "a": -0.07148356706471519,
            "b": 0.096925955228611,
            "c": -0.4093455431314182,
        },
        "hellaswag": {
            "a": -0.03367064575568235,
            "b": 0.09805145434945435,
            "c": -0.06719686154646046,
        },
        "arc": {
            "a": -0.036868206393668744,
            "b": 0.11761949039897288,
            "c": -0.10711223271542948,
        },
        "abstract_narrative_understanding": {
            "a": -0.0010020957189679732,
            "b": 0.18472699005645862,
            "c": -0.5431407140744655,
        },
    }

    if group in coefficients_by_group:
        entry = coefficients_by_group[group]
        quad, lin, const = entry["a"], entry["b"], entry["c"]

        def evaluate(x):
            return quad * (x**2) + lin * x + const

        return [{"brier_score": evaluate(point["log_flops"])} for point in input_data]

    raise ValueError(f"Group '{group}' not found in fitted parameters.")

#4 Run 4 R² = -1.000000

▼

Python

import json
import os
import numpy as np

def _load_fitted_params(params_file_path):
    """
    Read and parse the JSON file that stores the fitted coefficients.

    Args:
        params_file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content.

    Raises:
        RuntimeError: If the file does not exist or is not valid JSON. The
            underlying exception is attached as the cause.
    """
    try:
        # JSON text is UTF-8 by specification; do not depend on the locale default.
        with open(params_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError as err:
        raise RuntimeError(f"Fitted parameters file not found at {params_file_path}") from err
    except json.JSONDecodeError as err:
        raise RuntimeError(f"Error decoding JSON from {params_file_path}") from err


def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a per-group quadratic.

    The law is ``brier_score = A * log_flops**2 + B * log_flops + C``. The
    coefficients are read, on every call, from the JSON file
    ``.gemini/tmp/fitted_params.json`` located next to this module; each group
    entry is unpacked as the three values ``[A, B, C]``.

    Args:
        input_data: Data points to predict for; each one must provide a
                    'log_flops' entry.
        group: Name of the experimental group whose coefficients are used.
                The functional form is the same for all groups; only the
                coefficients differ.

    Returns:
        One ``{'brier_score': value}`` dictionary per input point, in input
        order.

    Raises:
        RuntimeError: If the parameters file is missing or is not valid JSON.
        ValueError: If ``group`` has no entry in the file, if its entry is not
            made of three numbers, or if a data point lacks 'log_flops'.
    """
    # NOTE(review): the file lives in a '.gemini/tmp' directory beside this
    # module, which looks like a tool scratch area; confirm it is shipped
    # together with the module.
    script_dir = os.path.dirname(__file__)
    params_file_path = os.path.join(script_dir, '.gemini', 'tmp', 'fitted_params.json')
    fitted_params = _load_fitted_params(params_file_path)

    if group not in fitted_params:
        raise ValueError(f"No fitted parameters found for group: {group}")

    A, B, C = fitted_params[group]
    # Unpacking a JSON object yields its keys (strings). With integer inputs
    # those would be combined by string repetition instead of failing, so
    # non-numeric coefficients are rejected explicitly.
    if not all(isinstance(coefficient, (int, float)) for coefficient in (A, B, C)):
        raise ValueError(
            f"Fitted parameters for group {group} must be three numbers [A, B, C]"
        )

    predictions = []
    for data_point in input_data:
        if 'log_flops' not in data_point:
            raise ValueError("Input data point missing 'log_flops' key.")

        log_flops = data_point['log_flops']
        # Apply the U-shaped (quadratic) scaling law.
        predictions.append({'brier_score': A * log_flops**2 + B * log_flops + C})

    return predictions

#5 Run 5 R² = -1.000000

▼

Python

def law(input_data: list[dict[str, float]], group: str) -> list[dict[str, float]]:
    """
    Predict ``brier_score`` from ``log_flops`` with a per-group quadratic.

    The law is ``brier_score = a * log_flops**2 + b * log_flops + c``. The
    functional form is shared by all groups; only (a, b, c) differ.

    Args:
        input_data: Data points to predict for. A point without a
                    'log_flops' value (missing or None) gets a None prediction.
        group: Name of the experimental group whose coefficients are used.

    Returns:
        One ``{'brier_score': value}`` dictionary per input point, in input
        order. When ``group`` has no coefficients a warning is printed and
        every value is None.
    """
    # Fitted coefficients per group, stored as (a, b, c) for a*x^2 + b*x + c.
    coefficient_table = {
        'mmlu': (0.01147626428052324, -0.06297043488789657, -0.4803646502198346),
        'parsinlu_qa_mc': (-0.05656739537407177, 0.09890583732640087, -0.4349507180682014),
        'arithmetic': (-0.12997814962868393, 0.23537009797522834, -0.2475326777122077),
        'hindu_knowledge': (-0.03440238896008072, -0.031143510554884634, -0.4103174193780912),
        'analogical_similarity': (-0.01917587967269841, 0.027911287483472436, -0.5405750537735577),
        'conceptual_combinations': (-0.07148356706471519, 0.096925955228611, -0.4093455431314182),
        'hellaswag': (-0.03367064575568235, 0.09805145434945435, -0.06719686154646046),
        'arc': (-0.036868206393668744, 0.11761949039897288, -0.10711223271542948),
        'abstract_narrative_understanding': (-0.0010020957189679732, 0.18472699005645862, -0.5431407140744655),
    }

    selected = coefficient_table.get(group)
    if selected is None:
        # Unknown groups are reported but do not raise: the caller receives a
        # None prediction for every data point.
        print(f"Warning: Group '{group}' not found in fitted coefficients. Returning None for brier_score.")
        return [{'brier_score': None} for _ in input_data]

    quad, lin, const = selected

    def score(point):
        x = point.get('log_flops')
        # A point without a usable input yields None rather than an error.
        if x is None:
            return None
        return quad * (x ** 2) + lin * x + const

    return [{'brier_score': score(point)} for point in input_data]