
Parallel Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.999954
Mean R²: 0.999945
Min R²: 0.999914
Runs: 5

All Runs (sorted by R²)

#1 Run 1 R² = 0.999954 (Best)
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Improved program using Variable Projection (VarPro) with Global Grid Search.
The model L = E + A * (N/1e9)^(-alpha) * K^(-beta) is fitted by optimizing the
non-linear exponents (alpha, beta) via a dense grid search followed by L-BFGS-B
refinement; the linear parameters (E, A) are solved for analytically at each step
with a closed-form 2D Non-Negative Least Squares (NNLS) solver.
Uses 4 parameters: [E, A, alpha, beta]
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Computes predicted loss using a power-law scaling model.
    Model: L = E + A * (N/1e9)^(-alpha) * K^(-beta)
    
    Args:
        data_points: (M, 2) array of [num_params, parallel_size] rows
        params: (4,) or (T, 4) array of [E, A, alpha, beta]

    Returns:
        Predicted loss values, shape (M,) for a single parameter set
        or (M, T) for a batch of T parameter sets
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    params = np.asarray(params, dtype=float)

    # Handle parameter batching
    if params.ndim == 1:
        params = params[None, :]  # (1, 4)
    
    # Extract inputs and normalize num_params (billions)
    N_scaled = X[:, 0] / 1.0e9 
    K = X[:, 1]

    # Extract parameters
    E     = params[:, 0]
    A     = params[:, 1]
    alpha = params[:, 2]
    beta  = params[:, 3]

    # Calculate power law terms with broadcasting
    # N_scaled: (M,), alpha: (T,) -> (M, T)
    term_N = N_scaled[:, None] ** (-alpha[None, :])
    term_K = K[:, None] ** (-beta[None, :])
    
    # Combined model: E + A * N^-alpha * K^-beta
    pred = E[None, :] + A[None, :] * term_N * term_K

    # Return appropriate shape
    if pred.shape[1] == 1:
        return pred[:, 0]
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law parameters using VarPro with grid search initialization.
    Optimizes (alpha, beta) while solving (E, A) analytically with non-negativity constraints.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=float))
    y_all = np.asarray(loss_values, dtype=float)
    
    # Handle multi-target fitting
    if y_all.ndim == 1:
        y_all = y_all[:, None]
        
    num_targets = y_all.shape[1]
    optimized_params = []
    
    # Normalize inputs
    N_scaled = X[:, 0] / 1.0e9
    K = X[:, 1]
    
    # Define grid for global search of non-linear parameters
    # alpha (model scaling) usually 0.1-1.5, beta (parallel) usually 0.0-0.5
    grid_alpha = np.linspace(0.01, 1.5, 15)
    grid_beta = np.linspace(0.0, 0.6, 10)
    grid_points = [(a, b) for a in grid_alpha for b in grid_beta]
    
    for i in range(num_targets):
        y = y_all[:, i]
        
        # Inner solver: Given alpha, beta, find optimal E, A >= 0
        def solve_linear_params(alpha, beta):
            # Feature Z = N^-alpha * K^-beta
            Z = (N_scaled ** -alpha) * (K ** -beta)
            
            # We want min || [1 Z] @ [E, A].T - y ||^2 s.t. E,A >= 0
            # Analytical 2D NNLS
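            # Normal equations for the 2x2 linear system in (E, A):
            #   [ n      sum_Z  ] [E]   [ sum_y  ]
            #   [ sum_Z  sum_Z2 ] [A] = [ sum_yZ ]
            # Solved via Cramer's rule; if the interior solution violates
            # E >= 0 or A >= 0, the NNLS optimum lies on a boundary, which
            # the A = 0 and E = 0 cases below cover.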
            sum_Z = np.sum(Z)
            sum_Z2 = np.sum(Z**2)
            sum_y = np.sum(y)
            sum_yZ = np.sum(y * Z)
            n = len(y)
            
            det = n * sum_Z2 - sum_Z**2
            
            best_mse = np.inf
            best_EA = (0.0, 0.0) # (E, A)
            
            # 1. Unconstrained Solution
            if det > 1e-13:
                E_unc = (sum_Z2 * sum_y - sum_Z * sum_yZ) / det
                A_unc = (n * sum_yZ - sum_Z * sum_y) / det
                if E_unc >= 0 and A_unc >= 0:
                    mse = np.mean((E_unc + A_unc * Z - y)**2)
                    return mse, E_unc, A_unc
            
            # 2. Boundary A=0 (Model is constant E)
            E_only = max(0.0, sum_y / n)
            mse_E = np.mean((E_only - y)**2)
            if mse_E < best_mse:
                best_mse = mse_E
                best_EA = (E_only, 0.0)
                
            # 3. Boundary E=0 (Model is A * Z)
            if sum_Z2 > 1e-13:
                A_only = max(0.0, sum_yZ / sum_Z2)
                mse_A = np.mean((A_only * Z - y)**2)
                if mse_A < best_mse:
                    best_mse = mse_A
                    best_EA = (0.0, A_only)
            
            return best_mse, best_EA[0], best_EA[1]

        # Step 1: Global Grid Search
        best_grid_mse = np.inf
        best_grid_params = (0.5, 0.1)
        
        for alpha_try, beta_try in grid_points:
            mse, _, _ = solve_linear_params(alpha_try, beta_try)
            if mse < best_grid_mse:
                best_grid_mse = mse
                best_grid_params = (alpha_try, beta_try)
                
        # Step 2: Local Refinement
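        # VarPro profiled objective: (E, A) are eliminated by the inner NNLS
        # solve, so the outer optimizer searches only the 2-D exponent space.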
        def objective(p):
            mse, _, _ = solve_linear_params(p[0], p[1])
            return mse
        
        # Bounds: alpha in [0, 5], beta in [0, 2]; both kept non-negative.
        bounds = [(0.0, 5.0), (0.0, 2.0)]
        
        try:
            res = minimize(objective, best_grid_params, method='L-BFGS-B', bounds=bounds)
            alpha_opt, beta_opt = res.x
        except Exception:
            # Fall back to the grid-search optimum if refinement fails
            alpha_opt, beta_opt = best_grid_params
            
        # Recover linear parameters
        _, E_opt, A_opt = solve_linear_params(alpha_opt, beta_opt)
        
        optimized_params.append([E_opt, A_opt, alpha_opt, beta_opt])
            
    final_params = np.array(optimized_params)
    
    if num_targets == 1:
        return final_params[0]
    return final_params
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.999954
#3 Run 3 R² = 0.999952
#4 Run 4 R² = 0.999951
#5 Run 5 R² = 0.999914
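Below is a minimal usage sketch, not part of the submitted run: the synthetic model sizes, parallel sizes, ground-truth parameters, and noise level are all illustrative assumptions, and the snippet assumes scaling_law_func and fit_scaling_law from the listing above are in scope.

Python
# Hypothetical smoke test; all data here is synthetic and invented.
import numpy as np

rng = np.random.default_rng(0)
model_sizes = np.array([1e8, 3e8, 1e9, 3e9, 1e10])  # num_params (assumed)
parallel_sizes = np.array([1, 2, 4, 8])             # K values (assumed)
X = np.array([(n, k) for n in model_sizes for k in parallel_sizes])

true_params = np.array([2.0, 1.5, 0.3, 0.1])  # [E, A, alpha, beta], invented
y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=len(X))

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print("fitted [E, A, alpha, beta]:", np.round(fitted, 3))
print("R^2:", 1.0 - ss_res / ss_tot)

Note that computing R² on the same points used for fitting is only a smoke test; the leaderboard's evaluation protocol may differ.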