# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
This evolved program aims to improve fitness by addressing numerical stability and
optimization robustness issues present in the previous version.
The primary improvements include:
1. **Fixed Feature Normalization:** Applied within `scaling_law_func` using global reference
constants (`P_REF`, `V_REF`, `C_REF`). This is crucial for handling input features
spanning vastly different magnitudes (e.g., 10^7 to 10^12), preventing numerical
instability during power law calculations and making parameter interpretation more consistent.
2. **Multi-Restart Optimization:** The `fit_scaling_law` function now employs an increased
number of optimization restarts (`n_restarts = 40`) with randomized initial guesses. This
strategy is vital for non-convex objectives such as scaling-law fitting, helping the
optimizer escape local minima and find a solution closer to the global optimum.
3. **Refined Parameter Bounds and Initial Guesses:** Bounds for coefficients (A, B, C) and
exponents (alpha_P, alpha_V, alpha_C) are carefully set to ensure physical meaningfulness
and numerical stability for normalized inputs. Initial guesses are dynamically estimated
from observed loss values and randomized to thoroughly explore the parameter space.
4. **Theoretically Grounded Functional Form:** The core scaling law function retains the
additive inverse power law form (Lossu = -( A * (Feature/Ref)^(-alpha) + ... + L_min )),
which is a widely used, parameter-efficient model that makes full use of the maximum
allowed 7 parameters and captures diminishing returns as each resource increases.
"""
import numpy as np
from scipy.optimize import minimize
# Define fixed reference scales for feature normalization.
# These constants are chosen to roughly center the input features around 1 or a manageable range,
# improving numerical stability for the power law calculations.
# - P_non_vocab: Data ranges from 3.3e7 to 1.1e9. P_REF = 1e8 is a good central value.
# - vocab_size: Data ranges from 4096 to 96256. V_REF = 1e4 is a good central value.
# - num_characters: Data ranges from 1e8 to 5e12. C_REF = 1e12 brings the normalized values
#   into a range of roughly 1e-4 to 5, which keeps the power-law terms numerically
#   manageable during optimization.
P_REF = 1e8
V_REF = 1e4
C_REF = 1e12
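# For example (illustrative values within the stated data ranges): P_non_vocab = 3.3e7
# normalizes to 3.3e7 / P_REF = 0.33, vocab_size = 32768 to 32768 / V_REF ~= 3.28, and
# num_characters = 1e10 to 1e10 / C_REF = 0.01.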
def scaling_law_func(data_points, params):
"""
Predicts Lossu values based on a scaling law model with fixed feature normalization.
The model form is:
Lossu = -( A * (P_non_vocab/P_REF)^(-alpha_P) + B * (vocab_size/V_REF)^(-alpha_V) + C * (num_characters/C_REF)^(-alpha_C) + L_min )
Parameters:
data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
Assumed to be positive.
params (np.ndarray): 1D array of 7 parameters:
[A, alpha_P, B, alpha_V, C, alpha_C, L_min]
A, B, C: positive coefficients for each term.
alpha_P, alpha_V, alpha_C: positive exponents for each term.
L_min: positive irreducible loss component (in the positive-loss space).
Returns:
np.ndarray: Predicted Lossu values (negative, where more negative is better).
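    Example (illustrative, hypothetical parameter values, for intuition only):
        A point [1e8, 1e4, 1e12] normalizes to [1, 1, 1], so every power term reduces to
        its coefficient and the prediction is -(2.5 + 2.5 + 0.25 + 0.1) = -5.35:

        >>> scaling_law_func(np.array([[1e8, 1e4, 1e12]]),
        ...                  np.array([2.5, 0.5, 2.5, 0.5, 0.25, 0.5, 0.1]))
        array([-5.35])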
"""
X = np.asarray(data_points)
# Apply fixed normalization using predefined reference scales
# and ensure all normalized inputs are positive to prevent issues with fractional powers.
    X_norm = X.astype(float)  # Work on a float copy so division and fractional powers are safe
X_norm[:, 0] = np.maximum(X_norm[:, 0] / P_REF, 1e-12) # P_non_vocab
X_norm[:, 1] = np.maximum(X_norm[:, 1] / V_REF, 1e-12) # vocab_size
X_norm[:, 2] = np.maximum(X_norm[:, 2] / C_REF, 1e-12) # num_characters
# Unpack parameters for clarity
A, alpha_P, B, alpha_V, C, alpha_C, L_min = params
# Calculate the positive-valued loss components (Y)
# Each term represents a diminishing return as the corresponding factor increases.
predicted_Y = (A * (X_norm[:, 0] ** -alpha_P) +
B * (X_norm[:, 1] ** -alpha_V) +
C * (X_norm[:, 2] ** -alpha_C) +
L_min)
# Lossu measures improvement over unigram (negative = better),
# so we return the negative of the predicted positive-valued loss Y.
return -predicted_Y
def fit_scaling_law(data_points, loss_values):
"""
Fits the scaling law function to the provided data using L-BFGS-B optimization.
This version incorporates:
1. Fixed feature normalization (handled by `scaling_law_func` using global constants).
    2. A larger number of random restarts to better explore the parameter space and avoid local minima.
3. Refined initial parameter guesses and randomization ranges tailored for normalized inputs
and the varying impact of different features.
4. Adjusted parameter bounds for exponents to allow for a wider search.
Parameters:
data_points (np.ndarray): (N,3) array with columns [P_non_vocab, vocab_size, num_characters].
loss_values (np.ndarray): (N,) array of corresponding Lossu values.
Returns:
np.ndarray: Optimized parameters [A, alpha_P, B, alpha_V, C, alpha_C, L_min].
"""
X = np.asarray(data_points)
y_lossu = np.asarray(loss_values)
# Convert Lossu (negative = better) to positive loss (Y) for easier interpretation of L_min
y_positive_loss = -y_lossu
min_observed_positive_loss = np.min(y_positive_loss)
max_observed_positive_loss = np.max(y_positive_loss)
# Define parameter bounds to ensure physical meaningfulness and numerical stability.
# Coefficients (A, B, C) and L_min must be positive.
# Exponents (alpha_P, alpha_V, alpha_C) must be positive and typically not extremely large.
# L_min should be positive and strictly less than the minimum observed positive loss.
bounds = [
(1e-10, None), # A: Coefficient, positive.
(1e-9, 2.0), # alpha_P: Exponent, positive and up to 2.0 (common for scaling laws).
(1e-10, None), # B: Coefficient, positive.
(1e-9, 2.0), # alpha_V: Exponent, positive and up to 2.0.
(1e-10, None), # C: Coefficient, positive.
(1e-9, 2.0), # alpha_C: Exponent, positive and up to 2.0.
(1e-9, min_observed_positive_loss - 1e-6) # L_min: Must be positive and strictly less than min observed loss.
]
# Ensure L_min upper bound is valid; if calculated upper bound is too tight or invalid, set a small default.
if bounds[6][1] <= 1e-9:
bounds[6] = (1e-9, 0.1)
def objective(params):
"""Calculates the Mean Squared Error between predicted and actual Lossu."""
predicted_lossu = scaling_law_func(X, params)
mse = np.mean((predicted_lossu - y_lossu) ** 2)
return mse
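    # Illustrative check of the objective (hypothetical numbers): if the model predicts Lossu of
    # [-5.0, -4.0] while the observed values are [-5.2, -3.9], the MSE is ((0.2)**2 + (0.1)**2) / 2 = 0.025.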
best_params = None
min_mse = np.inf
n_restarts = 40 # Increased number of restarts for more thorough exploration of the parameter space.
# Base initial guess for parameters, adjusted for the feature normalization and their relative impact.
# L_min is initialized as a fraction of the minimum observed positive loss.
l_min_init_guess = min_observed_positive_loss * 0.1 if min_observed_positive_loss * 0.1 > 1e-9 else 0.01
if l_min_init_guess > bounds[6][1]: # Ensure the initial guess for L_min is within its upper bound
l_min_init_guess = bounds[6][1] / 2
if l_min_init_guess < bounds[6][0]: # Ensure the initial guess for L_min is within its lower bound
l_min_init_guess = bounds[6][0] * 2
    # Coefficients are estimated from typical median normalized feature contributions (assuming
    # alpha = 0.5) and by distributing the average loss across the three terms. C starts smaller
    # because the normalized num_characters factor is often larger, so its term needs a smaller coefficient.
base_initial_params = np.array([
2.5, # A: Coefficient for P_non_vocab term (normalized)
0.5, # alpha_P: Exponent
2.5, # B: Coefficient for vocab_size term (normalized)
0.5, # alpha_V: Exponent
0.25, # C: Coefficient for num_characters term (normalized)
0.5, # alpha_C: Exponent
l_min_init_guess # L_min: Irreducible loss component
])
# Randomization ranges for coefficients A, B, C.
# Allowing a wider range for coefficients to capture diverse contributions,
# especially considering the wide range of normalized feature values.
coeff_rand_min_val = 1e-6 # Allows coefficients to be very small if a feature has minimal impact
coeff_rand_max_val = max_observed_positive_loss * 15 # Allows for larger coefficients if a feature dominates or normalized factor is small
for i in range(n_restarts):
# Generate randomized initial guesses for each parameter.
# Exponents are randomized across their full defined bounds.
# L_min is randomized within its valid range.
l_min_rand_upper = min(bounds[6][1], (min_observed_positive_loss + l_min_init_guess) / 2)
rand_params = np.array([
np.random.uniform(max(bounds[0][0], coeff_rand_min_val), coeff_rand_max_val), # A
np.random.uniform(bounds[1][0], bounds[1][1]), # alpha_P (full range)
np.random.uniform(max(bounds[2][0], coeff_rand_min_val), coeff_rand_max_val), # B
np.random.uniform(bounds[3][0], bounds[3][1]), # alpha_V (full range)
np.random.uniform(max(bounds[4][0], coeff_rand_min_val), coeff_rand_max_val), # C
np.random.uniform(bounds[5][0], bounds[5][1]), # alpha_C (full range)
np.random.uniform(bounds[6][0], l_min_rand_upper) # L_min
])
# Perform optimization using L-BFGS-B, which is suitable for bounded problems.
result = minimize(objective, rand_params, method='L-BFGS-B', bounds=bounds)
        if result.success:
            # result.fun is the objective value at result.x, so there is no need to re-evaluate it.
            current_mse = result.fun
            if current_mse < min_mse:
                min_mse = current_mse
                best_params = result.x
if best_params is not None:
return best_params
else:
# Fallback: If all randomized restarts fail, attempt one final optimization with the fixed base initial guess.
print("Warning: All randomized optimization restarts failed. Attempting final optimization with base initial guess.")
result = minimize(objective, base_initial_params, method='L-BFGS-B', bounds=bounds)
if result.success:
return result.x
else:
# If even the base guess fails, return the base initial guess as a last resort.
print(f"Warning: Final optimization with base guess failed: {result.message}. Returning base initial guess.")
return base_initial_params
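
if __name__ == "__main__":
    # Minimal self-test sketch (illustration only, not part of the evolved model): fit the law
    # to synthetic data generated from hypothetical ground-truth parameters and report the MSE.
    rng = np.random.default_rng(0)
    true_params = np.array([2.0, 0.4, 1.5, 0.3, 0.5, 0.2, 0.8])  # hypothetical ground truth
    demo_points = np.column_stack([
        rng.uniform(3.3e7, 1.1e9, size=64),  # P_non_vocab
        rng.uniform(4096, 96256, size=64),   # vocab_size
        rng.uniform(1e8, 5e12, size=64),     # num_characters
    ])
    demo_lossu = scaling_law_func(demo_points, true_params)
    demo_lossu = demo_lossu + rng.normal(scale=0.01, size=demo_lossu.shape)  # small observation noise
    fitted = fit_scaling_law(demo_points, demo_lossu)
    demo_mse = np.mean((scaling_law_func(demo_points, fitted) - demo_lossu) ** 2)
    print("Fitted parameters:", fitted)
    print("Training MSE:", demo_mse)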
# EVOLVE-BLOCK-END