Parallel Scaling Law

Agent: SLDAgent
Model: Gemini 2.5 Flash
Best R²: 0.999974
Mean R²: 0.999961
Min R²: 0.999954
Runs: 5

All Runs (sorted by R²)

Best Run 4 R² = 0.999974
Python
# EVOLVE-BLOCK-START
import numpy as np
from scipy.optimize import least_squares

def scaling_law_func(data_points, params):
    """
    Predicts language modeling loss based on num_params and parallel_size
    using a 4-parameter scaling law.

    Model Form: Loss = (A * (num_params ** -alpha) + B) * (parallel_size ** -beta)

    This model posits that the total loss is composed of a part that scales with
    model parameters (A * num_params^-alpha) and a base loss component (B).
    Both of these components are then reduced by parallelization (scaled by parallel_size^-beta).
    This implies that parallel augmentation reduces all components of the loss,
    including a baseline that is independent of num_params. This structure often
    provides a more accurate fit when the "irreducible" loss itself can be
    influenced by factors like parallelization.

    Parameters:
    - data_points: (N,2) array with columns [num_params, parallel_size]
    - params: 1D array of 4 parameters [A, alpha, B, beta]
        - A (p0): Coefficient for the num_params scaling term. Expected positive.
        - alpha (p1): Exponent for num_params. Expected positive, typically < 1.0.
        - B (p2): Base loss component, which is still subject to parallel scaling. Expected positive.
        - beta (p3): Exponent for parallel_size scaling. Expected positive, typically < 1.0.

    Returns:
    - Predicted loss values (N,)
    """
    num_params = data_points[:, 0]
    parallel_size = data_points[:, 1]

    # Unpack parameters
    A, alpha, B, beta = params

    # Calculate num_params scaling term
    # Using np.power for robust handling of exponents.
    num_params_scaled = np.power(num_params, -alpha)

    # Calculate parallel_size scaling term
    parallel_size_scaled = np.power(parallel_size, -beta)

    # Combine terms according to the chosen model form
    predicted_loss = (A * num_params_scaled + B) * parallel_size_scaled
    
    return predicted_loss
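
# Illustrative sanity check of the model form above (hypothetical numbers, not
# fitted values): with A = 50, alpha = 0.15, B = 2.0 and beta = 0.1, a model with
# num_params = 1e8 evaluated at parallel_size = 4 would give
#   (50 * 1e8**-0.15 + 2.0) * 4**-0.1 ≈ (50 * 0.063 + 2.0) * 0.871 ≈ 4.49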

def fit_scaling_law(data_points, loss_values):
    """
    Optimizes the 4 parameters of the scaling_law_func to fit the given data.

    The initial guess is derived from the range of the input data and the observed
    loss values, which makes the optimization more robust across datasets. Fitting
    uses scipy's least_squares, which handles bounded non-linear curve fitting and
    keeps the fitted parameters physically meaningful (all positive, exponents <= 1).

    Parameters:
    - data_points: (N,2) array with columns [num_params, parallel_size]
    - loss_values: Array of corresponding loss values (N,)

    Returns:
    - Optimized parameters (1D array of 4 parameters)
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)

    # Residuals function for least_squares.
    # least_squares minimizes the sum of squares of the values returned by this function.
    def residuals(params, X_data, y_data):
        pred = scaling_law_func(X_data, params)
        # Clamp predictions to a small positive value, since loss cannot be negative.
        # This also avoids numerical issues (e.g., if predictions are later log-transformed).
        pred = np.maximum(pred, 1e-9)
        return pred - y_data  # residual: prediction minus observed value

    # --- Initial Guess Calculation for Model: L = (A * N^-alpha + B) * P^-beta ---
    # These heuristics are derived to provide robust starting points for the optimizer,
    # aiding convergence and finding better minima across different datasets.
    num_params_data = X[:, 0]
    parallel_size_data = X[:, 1]
    
    loss_min_obs = np.min(y)
    loss_max_obs = np.max(y)
    num_params_min_obs = np.min(num_params_data)
    # parallel_size_min_obs is typically 1; parallel_size_max_obs is 4, based on problem description.
    parallel_size_min_obs = np.min(parallel_size_data) 
    parallel_size_max_obs = np.max(parallel_size_data) 

    # Initial guesses for exponents (alpha, beta).
    # These are common values for scaling exponents, indicating diminishing returns.
    initial_alpha = 0.15
    initial_beta = 0.1

    # Heuristic for initial B (base loss component before parallel scaling):
    # The minimum observed loss (loss_min_obs) typically occurs at the highest num_params
    # and highest parallel_size (parallel_size_max_obs).
    # At very large num_params (N -> inf), the term A * N^-alpha approaches 0.
    # So, loss_min_obs ≈ B * (parallel_size_max_obs ** -initial_beta).
    # Solving for B: B ≈ loss_min_obs / (parallel_size_max_obs ** -initial_beta).
    initial_B = loss_min_obs / (parallel_size_max_obs ** -initial_beta)
    initial_B = np.maximum(initial_B, 1e-9) # Ensure B stays strictly positive

    # Heuristic for initial A (coefficient for num_params scaling):
    # The maximum observed loss (loss_max_obs) typically occurs at the minimum num_params
    # (num_params_min_obs) and minimum parallel_size (parallel_size_min_obs = 1).
    # The model equation at this point is:
    # loss_max_obs = (A * (num_params_min_obs ** -initial_alpha) + B) * (parallel_size_min_obs ** -initial_beta)
    
    # Since parallel_size_min_obs is 1, (parallel_size_min_obs ** -initial_beta) simplifies to 1.
    # So, the equation becomes: loss_max_obs = A * (num_params_min_obs ** -initial_alpha) + B
    # Rearranging to solve for A:
    # A = (loss_max_obs - B) / (num_params_min_obs ** -initial_alpha)
    
    numerator_A = loss_max_obs - initial_B
    # A must be positive. If (loss_max_obs - initial_B) is negative (e.g., due to an overestimated initial_B),
    # clamp it to a small positive value to prevent negative A.
    numerator_A = np.maximum(numerator_A, 1e-9) 

    denominator_A = num_params_min_obs ** -initial_alpha
    denominator_A = np.maximum(denominator_A, 1e-9) # Prevent division by zero or very small numbers
    
    initial_A = numerator_A / denominator_A
    initial_A = np.clip(initial_A, 1e-3, 1e5) # Clamp A to a reasonable range to prevent extreme initial values

    # Assemble initial parameters in the order [A, alpha, B, beta]
    initial_params = np.array([initial_A, initial_alpha, initial_B, initial_beta])

    # Bounds for parameters [A, alpha, B, beta] for least_squares:
    # All parameters are expected to be positive for this model to make physical sense
    # in the context of diminishing loss returns. Exponents (alpha, beta) typically less than 1.0.
    bounds_lower = [1e-9, 1e-9, 1e-9, 1e-9]  # A, alpha, B, beta must be positive
    bounds_upper = [np.inf, 1.0, np.inf, 1.0] # alpha and beta typically < 1.0 for diminishing returns

    # Use least_squares with 'trf' method which is robust and handles bounds well.
    result = least_squares(residuals, initial_params, args=(X, y), 
                           bounds=(bounds_lower, bounds_upper), method='trf', loss='linear')

    # least_squares always returns the best parameters it found in result.x, even
    # when convergence is not flagged via result.success, so no explicit success
    # check is needed; the solution may be a local rather than global minimum.
    params_opt = result.x

    return params_opt
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.999969
#3 Run 1 R² = 0.999954
#4 Run 3 R² = 0.999954
#5 Run 5 R² = 0.999954
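
Below is a minimal sketch of how the two evolved functions might be exercised outside the benchmark harness, assuming they are importable from the module above. The synthetic "true" parameters, the grid of (num_params, parallel_size) points, the noise level, and the inline R² computation are illustrative assumptions, not values or code from the actual evaluation.

Python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical "true" parameters [A, alpha, B, beta], used only to fabricate data.
true_params = np.array([60.0, 0.2, 2.0, 0.1])

# Grid of (num_params, parallel_size) points; the ranges are assumptions.
num_params = np.logspace(7, 10, 8)      # 10M to 10B parameters
parallel_size = np.array([1, 2, 4])
grid = np.array([[n, p] for n in num_params for p in parallel_size])

# Generate noisy synthetic losses from the same model form, then refit.
loss = scaling_law_func(grid, true_params)
loss_noisy = loss * (1.0 + 0.01 * rng.standard_normal(loss.shape))
fitted = fit_scaling_law(grid, loss_noisy)
pred = scaling_law_func(grid, fitted)

# R² of the fit, computed directly from the residuals.
ss_res = np.sum((loss_noisy - pred) ** 2)
ss_tot = np.sum((loss_noisy - np.mean(loss_noisy)) ** 2)
print("fitted params:", fitted)
print("R^2:", 1.0 - ss_res / ss_tot)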