Parallel Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.999974
Mean R²: 0.999961
Min R²: 0.999954
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.999974
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares

def scaling_law_func(data_points, params):
    """
    Predicts language modeling loss based on num_params and parallel_size
    using a 4-parameter scaling law.

    Model Form: Loss = (A * (num_params ** -alpha) + B) * (parallel_size ** -beta)

    This model posits that the total loss is composed of a part that scales with
    model parameters (A * num_params^-alpha) and a base loss component (B).
    Both of these components are then reduced by parallelization (scaled by parallel_size^-beta).
    This implies that parallel augmentation reduces all components of the loss,
    including a baseline that is independent of num_params. This structure often
    provides a more accurate fit when the "irreducible" loss itself can be
    influenced by factors like parallelization.

    Parameters:
    - data_points: (N,2) array with columns [num_params, parallel_size]
    - params: 1D array of 4 parameters [A, alpha, B, beta]
        - A (p0): Coefficient for the num_params scaling term. Expected positive.
        - alpha (p1): Exponent for num_params. Expected positive, typically < 1.0.
        - B (p2): Base loss component, which is still subject to parallel scaling. Expected positive.
        - beta (p3): Exponent for parallel_size scaling. Expected positive, typically < 1.0.

    Returns:
    - Predicted loss values (N,)
    """
    num_params = data_points[:, 0]
    parallel_size = data_points[:, 1]

    # Unpack parameters
    A, alpha, B, beta = params

    # Calculate num_params scaling term
    # Using np.power for robust handling of exponents.
    num_params_scaled = np.power(num_params, -alpha)

    # Calculate parallel_size scaling term
    parallel_size_scaled = np.power(parallel_size, -beta)

    # Combine terms according to the chosen model form
    predicted_loss = (A * num_params_scaled + B) * parallel_size_scaled
    
    return predicted_loss
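
# Illustrative sanity check of the model form above (hypothetical numbers, not
# fitted values): with A = 50, alpha = 0.15, B = 2.0 and beta = 0.1, a model with
# num_params = 1e8 evaluated at parallel_size = 4 would give
#   (50 * 1e8**-0.15 + 2.0) * 4**-0.1 ≈ (50 * 0.063 + 2.0) * 0.871 ≈ 4.49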

def fit_scaling_law(data_points, loss_values):
    """
    Optimizes the 4 parameters of the scaling_law_func to fit the given data.

    The initial guess is derived from the range of the input data and the observed
    loss values, which makes the optimization more robust across datasets. Fitting
    uses scipy's least_squares, which handles bounded non-linear curve fitting and
    keeps the fitted parameters physically meaningful (all positive, exponents <= 1).

    Parameters:
    - data_points: (N,2) array with columns [num_params, parallel_size]
    - loss_values: Array of corresponding loss values (N,)

    Returns:
    - Optimized parameters (1D array of 4 parameters)
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)

    # Residuals function for least_squares.
    # least_squares minimizes the sum of squares of the values returned by this function.
    def residuals(params, X_data, y_data):
        pred = scaling_law_func(X_data, params)
        # Clamp predictions to a small positive value, since loss cannot be negative.
        # This also avoids numerical issues (e.g., if predictions are later log-transformed).
        pred = np.maximum(pred, 1e-9)
        return pred - y_data  # residual: prediction minus observed value

    # --- Initial Guess Calculation for Model: L = (A * N^-alpha + B) * P^-beta ---
    # These heuristics are derived to provide robust starting points for the optimizer,
    # aiding convergence and finding better minima across different datasets.
    num_params_data = X[:, 0]
    parallel_size_data = X[:, 1]
    
    loss_min_obs = np.min(y)
    loss_max_obs = np.max(y)
    num_params_min_obs = np.min(num_params_data)
    # parallel_size_min_obs is typically 1; parallel_size_max_obs is 4, based on problem description.
    parallel_size_min_obs = np.min(parallel_size_data) 
    parallel_size_max_obs = np.max(parallel_size_data) 

    # Initial guesses for exponents (alpha, beta).
    # These are common values for scaling exponents, indicating diminishing returns.
    initial_alpha = 0.15
    initial_beta = 0.1

    # Heuristic for initial B (base loss component before parallel scaling):
    # The minimum observed loss (loss_min_obs) typically occurs at the highest num_params
    # and highest parallel_size (parallel_size_max_obs).
    # At very large num_params (N -> inf), the term A * N^-alpha approaches 0.
    # So, loss_min_obs ≈ B * (parallel_size_max_obs ** -initial_beta).
    # Solving for B: B ≈ loss_min_obs / (parallel_size_max_obs ** -initial_beta).
    initial_B = loss_min_obs / (parallel_size_max_obs ** -initial_beta)
    initial_B = np.maximum(initial_B, 1e-9) # Ensure B stays strictly positive

    # Heuristic for initial A (coefficient for num_params scaling):
    # The maximum observed loss (loss_max_obs) typically occurs at the minimum num_params
    # (num_params_min_obs) and minimum parallel_size (parallel_size_min_obs = 1).
    # The model equation at this point is:
    # loss_max_obs = (A * (num_params_min_obs ** -initial_alpha) + B) * (parallel_size_min_obs ** -initial_beta)
    
    # Since parallel_size_min_obs is 1, (parallel_size_min_obs ** -initial_beta) simplifies to 1.
    # So, the equation becomes: loss_max_obs = A * (num_params_min_obs ** -initial_alpha) + B
    # Rearranging to solve for A:
    # A = (loss_max_obs - B) / (num_params_min_obs ** -initial_alpha)
    
    numerator_A = loss_max_obs - initial_B
    # A must be positive. If (loss_max_obs - initial_B) is negative (e.g., due to an overestimated initial_B),
    # clamp it to a small positive value to prevent negative A.
    numerator_A = np.maximum(numerator_A, 1e-9) 

    denominator_A = num_params_min_obs ** -initial_alpha
    denominator_A = np.maximum(denominator_A, 1e-9) # Prevent division by zero or very small numbers
    
    initial_A = numerator_A / denominator_A
    initial_A = np.clip(initial_A, 1e-3, 1e5) # Clamp A to a reasonable range to prevent extreme initial values

    # Assemble initial parameters in the order [A, alpha, B, beta]
    initial_params = np.array([initial_A, initial_alpha, initial_B, initial_beta])

    # Bounds for parameters [A, alpha, B, beta] for least_squares:
    # All parameters are expected to be positive for this model to make physical sense
    # in the context of diminishing loss returns. Exponents (alpha, beta) typically less than 1.0.
    bounds_lower = [1e-9, 1e-9, 1e-9, 1e-9]  # A, alpha, B, beta must be positive
    bounds_upper = [np.inf, 1.0, np.inf, 1.0] # alpha and beta typically < 1.0 for diminishing returns

    # Use least_squares with 'trf' method which is robust and handles bounds well.
    result = least_squares(residuals, initial_params, args=(X, y), 
                           bounds=(bounds_lower, bounds_upper), method='trf', loss='linear')

    # least_squares always returns the best parameters it found in result.x, even
    # when convergence is not flagged via result.success, so no explicit success
    # check is needed; the solution may be a local rather than global minimum.
    params_opt = result.x

    return params_opt
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.999969
#3 Run 1 R² = 0.999954
#4 Run 3 R² = 0.999954
#5 Run 5 R² = 0.999954
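
Below is a minimal sketch of how the two evolved functions might be exercised outside the benchmark harness, assuming they are importable from the module above. The synthetic "true" parameters, the grid of (num_params, parallel_size) points, the noise level, and the inline R² computation are illustrative assumptions, not values or code from the actual evaluation.

Python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical "true" parameters [A, alpha, B, beta], used only to fabricate data.
true_params = np.array([60.0, 0.2, 2.0, 0.1])

# Grid of (num_params, parallel_size) points; the ranges are assumptions.
num_params = np.logspace(7, 10, 8)      # 10M to 10B parameters
parallel_size = np.array([1, 2, 4])
grid = np.array([[n, p] for n in num_params for p in parallel_size])

# Generate noisy synthetic losses from the same model form, then refit.
loss = scaling_law_func(grid, true_params)
loss_noisy = loss * (1.0 + 0.01 * rng.standard_normal(loss.shape))
fitted = fit_scaling_law(grid, loss_noisy)
pred = scaling_law_func(grid, fitted)

# R² of the fit, computed directly from the residuals.
ss_res = np.sum((loss_noisy - pred) ** 2)
ss_tot = np.sum((loss_noisy - np.mean(loss_noisy)) ** 2)
print("fitted params:", fitted)
print("R^2:", 1.0 - ss_res / ss_tot)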