Vocabulary Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.975124
Mean R²: 0.770807
Min R²: 0.000007
Runs: 5
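
Mean R² is the arithmetic mean of the five runs listed below, pulled down by the near-zero fifth run: (0.975124 + 0.974829 + 0.954163 + 0.949911 + 0.000007) / 5 ≈ 0.770807.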

All Runs (sorted by R²)

Best Run 1 R² = 0.975124
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Evolved program with a more theoretically grounded scaling law form,
improved numerical stability through log-transformation of base features,
and more informed initial guesses and bounds for the optimization.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predicts Lossu values based on a revised scaling law model.
    The model form is:
    Lossu = L_min + A * P_non_vocab^(-alpha_P) + B * vocab_size^(-alpha_V) + C * num_characters^(-alpha_C)

    This form correctly models Lossu decreasing (becoming more negative) as resources increase.
    To improve numerical stability, X^(-alpha) is computed as exp(-alpha * log(X)).

    Parameters:
    data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
                              Assumed to be positive.
    params (np.ndarray): 1D array of 7 parameters:
                         [L_min, A, alpha_P, B, alpha_V, C, alpha_C]
                         L_min: irreducible loss (most negative Lossu possible).
                         A, B, C: positive coefficients for each term.
                         alpha_P, alpha_V, alpha_C: positive exponents for each term.

    Returns:
    np.ndarray: Predicted Lossu values (negative, where more negative is better).
    """
    X = np.asarray(data_points)
    
    # Ensure all inputs are strictly positive for log transformation.
    X = np.maximum(X, 1e-12) # Small positive epsilon to prevent log(0)

    # Unpack parameters for clarity
    L_min, A, alpha_P, B, alpha_V, C, alpha_C = params

    # Calculate log of input features for numerical stability in power law.
    # X_i^(-alpha_i) is equivalent to exp(-alpha_i * log(X_i))
    log_P = np.log(X[:, 0])
    log_V = np.log(X[:, 1])
    log_C = np.log(X[:, 2])

    # Calculate the predicted Lossu
    # Lossu = L_min + A * P^(-alpha_P) + B * V^(-alpha_V) + C * C_chars^(-alpha_C)
    predicted_lossu = (L_min +
                       A * np.exp(-alpha_P * log_P) +
                       B * np.exp(-alpha_V * log_V) +
                       C * np.exp(-alpha_C * log_C))
    
    return predicted_lossu


def fit_scaling_law(data_points, loss_values):
    """
    Fits the revised scaling law function to the provided data using L-BFGS-B optimization.
    Uses more informed initial guesses and tighter bounds based on typical scaling law parameters
    and the observed range of Lossu values.

    Parameters:
    data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
    loss_values (np.ndarray): (N,) array of corresponding Lossu values.

    Returns:
    np.ndarray: Optimized parameters [L_min, A, alpha_P, B, alpha_V, C, alpha_C].
    """
    X = np.asarray(data_points)
    y_lossu = np.asarray(loss_values)

    min_observed_lossu = np.min(y_lossu)
    max_observed_lossu = np.max(y_lossu)

    # Informed initial guesses for parameters [L_min, A, alpha_P, B, alpha_V, C, alpha_C]
    # L_min: Should be slightly more negative than the best observed Lossu.
    # A, B, C: Coefficients to scale the power-law terms to fit the Lossu range.
    # Exponents (alpha_P, alpha_V, alpha_C): typically positive (0.1 to 0.7 for diminishing returns).
    initial_params = np.array([
        min_observed_lossu * 1.05, # L_min: e.g., -5.34 * 1.05 = -5.607 (more negative than min_observed_lossu)
        50.0,   # A: Coefficient for P_non_vocab
        0.3,    # alpha_P: Exponent for P_non_vocab
        20.0,   # B: Coefficient for vocab_size
        0.3,    # alpha_V: Exponent for vocab_size
        30.0,   # C: Coefficient for num_characters
        0.3     # alpha_C: Exponent for num_characters
    ])
    
    # Bounds for parameters to ensure physical meaningfulness and numerical stability.
    bounds = [
        (min_observed_lossu * 2, min_observed_lossu), # L_min: upper bound is the best (most negative) observed Lossu;
                                                      # lower bound is twice as negative. Assumes min_observed_lossu < 0,
                                                      # otherwise the two bounds would be inverted.
        (1e-6, 1e4),    # A: Coefficient, positive, broad range
        (1e-3, 1.0),    # alpha_P: Exponent, positive (e.g., 0.001 to 1.0)
        (1e-6, 1e3),    # B: Coefficient, positive
        (1e-3, 1.0),    # alpha_V: Exponent, positive
        (1e-6, 1e5),    # C: Coefficient, positive
        (1e-3, 1.0)     # alpha_C: Exponent, positive
    ]

    def objective(params):
        """Calculates the Mean Squared Error between predicted and actual Lossu."""
        predicted_lossu = scaling_law_func(X, params)
        mse = np.mean((predicted_lossu - y_lossu) ** 2)
        return mse

    # Use L-BFGS-B, which is suitable for bounded optimization problems.
    result = minimize(objective, initial_params, method='L-BFGS-B', bounds=bounds)

    if result.success:
        return result.x
    else:
        # If optimization fails to converge, L-BFGS-B still returns the best parameters found.
        # It's better to return these than the initial guess if some progress was made.
        print(f"Warning: Optimization failed: {result.message}. Returning best parameters found.")
        if result.x is not None:
            return result.x
        else:
            return initial_params # Fallback to initial guess if result.x is somehow None

# EVOLVE-BLOCK-END
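
A minimal usage sketch of the two functions above, assuming it is run in the same module as the evolved program. The input array, the target values, and the R² helper are illustrative assumptions, not part of the evolved program, and the leaderboard's own R² may be computed differently in detail:

Python
import numpy as np

# Hypothetical inputs: columns are [P_non_vocab, vocab_size, num_characters].
X_demo = np.array([
    [3.0e7, 8.0e3, 1.0e9],
    [8.0e7, 1.6e4, 4.0e9],
    [3.0e8, 3.2e4, 2.0e10],
    [6.0e8, 4.8e4, 5.0e10],
    [1.0e9, 6.4e4, 8.0e10],
    [3.0e9, 1.0e5, 2.0e11],
])
# Hypothetical Lossu targets (negative; more negative is better).
y_demo = np.array([-3.1, -3.8, -4.5, -4.9, -5.1, -5.3])

params = fit_scaling_law(X_demo, y_demo)   # [L_min, A, alpha_P, B, alpha_V, C, alpha_C]
y_pred = scaling_law_func(X_demo, params)

# Standard coefficient of determination (R²) on the fitted points.
ss_res = np.sum((y_demo - y_pred) ** 2)
ss_tot = np.sum((y_demo - np.mean(y_demo)) ** 2)
r2 = 1.0 - ss_res / ss_tot
print(params, r2)
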
#2 Run 4 R² = 0.974829
#3 Run 2 R² = 0.954163
#4 Run 3 R² = 0.949911
#5 Run 5 R² = 0.000007