
Vocabulary Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.979398
Mean R²: 0.979398
Min R²: 0.979396
Runs: 5

All Runs (sorted by R²)

Best · Run 2 · R² = 0.979398
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning.
Model: Lossu = Bias + c1*(N/S1)^e1 + c2*(V/S2)^e2 + c3*(D/S3)^e3
Optimization: Variable Projection (VarPro) driven by Differential Evolution.
1. Separates non-linear parameters (exponents) from linear parameters (coeffs, bias).
2. Uses Differential Evolution to globally search for optimal exponents, robust to local minima.
3. Uses Ridge-regularized Linear Least Squares in the inner loop to handle collinearity
   (e.g., when exponents are close to 0 or to each other).
4. Re-fits the final linear coefficients with plain OLS (no ridge penalty) so they are
   not shrunk by the regularization.
"""
import numpy as np
from scipy.optimize import differential_evolution

# Fixed scaling factors to normalize inputs (N, V, D)
# Brings each feature to roughly order 1 for numerical stability
SCALES = np.array([1e9, 1e4, 1e11])
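# (Presumably N = model parameters ~1e9, V = vocabulary size ~1e4, D = training tokens ~1e11;
#  the exact meaning of each column is an assumption, not stated in this listing.)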

def scaling_law_func(data_points, params):
    """
    Predicts Lossu.
    params: [Bias, c1, c2, c3, e1, e2, e3]
    """
    X = np.atleast_2d(np.asarray(data_points))
    # Normalize inputs
    X_norm = X / SCALES[None, :]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Unpack
    bias      = params[:, 0]
    coeffs    = params[:, 1:4]
    exponents = params[:, 4:7]
    
    # Compute power terms: (X_norm)^e
    # Use abs for safety, though inputs are positive
    # Broadcasting: (N, 1, 3) ** (1, T, 3) -> (N, T, 3)
    terms = (np.abs(X_norm[:, None, :]) + 1e-12) ** exponents[None, :, :]
    
    # Weighted sum: Bias + c1*T1 + c2*T2 + c3*T3
    pred = (coeffs[None, :, :] * terms).sum(axis=2) + bias[None, :]
    
    return pred[:, 0] if pred.shape[1] == 1 else pred

def fit_scaling_law(data_points, loss_values):
    """
    Fits parameters using Variable Projection with Differential Evolution.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    if y.ndim == 1:
        y = y[:, None]
        
    N_samples = X.shape[0]
    N_targets = y.shape[1]
    
    X_norm = X / SCALES[None, :]
    # Precompute log for speed in optimization loop
    # Add epsilon to avoid log(0)
    log_X = np.log(np.abs(X_norm) + 1e-12)
    
    results = []
    
    for t in range(N_targets):
        y_curr = y[:, t]
        
        # Inner solver: Ridge Regression to find optimal coeffs for given exponents
        # Returns MSE
        def objective(exps):
            # exps: [e1, e2, e3]
            # Terms: exp(e * log_x)
            terms = np.exp(exps[None, :] * log_X)
            
            # Design matrix: [1, terms]
            A = np.column_stack([np.ones(N_samples), terms])
            
            # Ridge solve: (A.T A + alpha I) w = A.T y
            # Regularization prevents singularity when exponents are similar or 0
            alpha = 1e-7
            AtA = A.T @ A
            Aty = A.T @ y_curr
            
            # Regularize diagonal
            reg_matrix = np.eye(A.shape[1]) * alpha
            reg_matrix[0, 0] = 0 # Do not penalize Bias intercept
            
            try:
                w = np.linalg.solve(AtA + reg_matrix, Aty)
            except np.linalg.LinAlgError:
                return 1e10 # Fail
            
            # MSE
            pred = A @ w
            mse = np.mean((pred - y_curr)**2)
            return mse

        # Global Optimization for Exponents
        # Bounds: [-4.0, 4.0] covers inverse, inverse-square, linear, quadratic, etc.
        bounds = [(-4.0, 4.0), (-4.0, 4.0), (-4.0, 4.0)]
        
        # Differential Evolution
        # Robust global search. polish=True performs local gradient-based refinement (L-BFGS-B)
        res = differential_evolution(objective, bounds, 
                                     strategy='best1bin', 
                                     popsize=15, 
                                     tol=1e-5, 
                                     maxiter=100,
                                     polish=True, 
                                     seed=42)
        
        best_exps = res.x
        
        # Re-fit linear coefficients with plain OLS (no ridge) so they are not shrunk by regularization
        terms = np.exp(best_exps[None, :] * log_X)
        A = np.column_stack([np.ones(N_samples), terms])
        w, _, _, _ = np.linalg.lstsq(A, y_curr, rcond=None)
        
        # Params: [Bias, c1, c2, c3, e1, e2, e3]
        params = np.concatenate([w, best_exps])
        results.append(params)
        
    return np.vstack(results)[0] if N_targets == 1 else np.vstack(results)
# EVOLVE-BLOCK-END
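
Usage sketch (not part of the evolved block): the snippet below assumes the two functions above are in scope, generates synthetic (N, V, D) points from an arbitrary ground-truth parameter vector, fits them with fit_scaling_law, and reports R². The ranges, coefficients, and noise level are invented for illustration and are not the benchmark data.

Python
# Minimal usage sketch. Synthetic data only; ranges and "true" parameters are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n_pts = 200

# Hypothetical grid: N ~ 1e8-1e10, V ~ 1e3-1e5, D ~ 1e10-1e12 (matches the SCALES order above)
X = np.column_stack([
    10 ** rng.uniform(8, 10, n_pts),   # N
    10 ** rng.uniform(3, 5, n_pts),    # V
    10 ** rng.uniform(10, 12, n_pts),  # D
])

# Arbitrary ground truth in the fitted form [Bias, c1, c2, c3, e1, e2, e3]
true_params = np.array([2.0, 0.5, 0.3, 0.4, -0.30, -0.20, -0.25])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, n_pts)

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print(f"Recovered params: {np.round(fitted, 3)}")
print(f"R^2 on synthetic data: {r2:.4f}")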
#2 · Run 3 · R² = 0.979398
#3 · Run 4 · R² = 0.979398
#4 · Run 5 · R² = 0.979398
#5 · Run 1 · R² = 0.979396