SLD - SFT Scaling Law - SLDAgent + Gemini 3 Pro Preview

All Runs (sorted by R²)

Best Run 3 R² = 0.999266

▼

Python

# EVOLVE-BLOCK-START
"""
Improved scaling law discovery using a 4-parameter Sigmoidal (Hill) function:
L(D) = E + A / (1 + (D/B)^alpha)
Implemented as: L(D) = E + A * sigmoid(-alpha * (ln D - ln B)) for numerical stability.
Optimization Strategy:
1. Variable Projection with Grid Search:
   - Scan (log(B), alpha) space.
   - For each pair, solve for (E, A) using Non-Negative Least Squares (NNLS).
   - This effectively finds the global basin of attraction.
2. Non-Linear Least Squares Refinement:
   - Use the best grid candidates to initialize a Trust Region Reflective fit.
   - Optimize all 4 parameters jointly with bounds to ensure physical validity.
   - A is capped (normalized A <= 20) and E is only constrained to be non-negative, which prevents unstable large-parameter solutions.
"""
import numpy as np
from scipy.optimize import nnls, curve_fit
from scipy.special import expit

def scaling_law_func(data_points, params):
    """Evaluate the Hill-type law L(D) = E + A / (1 + (D / B)**alpha).

    The curve is computed as E + A * sigmoid(-alpha * (ln D - ln B)), which is
    the same expression written in a form that cannot overflow.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        params: [E, A, B, alpha] as a (4,) vector, or a (T, 4) array holding
            one parameter row per curve.

    Returns:
        Predictions of shape (N,) when a single parameter row is given,
        otherwise (N, T).
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0]

    theta = np.asarray(params)
    theta = theta[None, :] if theta.ndim == 1 else theta
    floor, amplitude, midpoint, steepness = (theta[:, k] for k in range(4))

    # Clamp before taking logs so zero sizes / non-positive B stay finite.
    log_sizes = np.log(np.maximum(sizes, 1e-10))[:, None]        # (N, 1)
    log_midpoint = np.log(np.maximum(midpoint, 1e-20))[None, :]  # (1, T)

    # sigmoid(-alpha * (ln D - ln B)) == 1 / (1 + (D / B)**alpha)
    gate = expit(-steepness[None, :] * (log_sizes - log_midpoint))
    curves = floor[None, :] + amplitude[None, :] * gate

    return curves[:, 0] if curves.shape[1] == 1 else curves

def fit_scaling_law(data_points, loss_values):
    """Fit L(D) = E + A / (1 + (D / B)**alpha) to one or more loss curves.

    The fit runs in two stages on normalized data (x / max(x), y / max(y)):

    1. Variable projection on a (log B, alpha) grid: for every grid point the
       linear parameters (E, A) are obtained with non-negative least squares.
    2. The three best grid candidates each seed a bounded Trust Region
       Reflective refinement of all four parameters; the refinement with the
       lowest mean squared error wins.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        loss_values: (N,) losses for one curve, or (N, T) for T curves that
            share the same data sizes.

    Returns:
        [E, A, B, alpha] in the original (de-normalized) units, with shape
        (4,) for 1-D ``loss_values`` and (T, 4) otherwise.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    y_in = np.asarray(loss_values)
    y_2d = y_in[:, None] if y_in.ndim == 1 else y_in

    num_targets = y_2d.shape[1]
    results = np.zeros((num_targets, 4))

    # Normalize inputs for numerical stability.
    x_max = np.max(x) if x.size > 0 else 1.0
    x_norm = x / x_max
    log_x_norm = np.log(np.maximum(x_norm, 1e-10))
    ones_vec = np.ones_like(x_norm)

    def model_opt(x_n, e_n, a_n, log_b_n, alf):
        # curve_fit hands x back to us, so the log is recomputed here.
        lx = np.log(np.maximum(x_n, 1e-10))
        return e_n + a_n * expit(-alf * (lx - log_b_n))

    # Grid over the non-linear parameters, in normalized units:
    #   log B: -10 .. 2 (orders of magnitude below and above the data range)
    #   alpha: 0.1 .. 5.0
    lb_grid = np.linspace(-10, 2.0, 13)
    a_grid = np.array([0.1, 0.3, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0])

    # The NNLS design matrices [1, w] depend only on x, so build them once
    # instead of once per target.
    grid = [
        (lb, alf, np.column_stack([ones_vec, expit(-alf * (log_x_norm - lb))]))
        for lb in lb_grid
        for alf in a_grid
    ]

    # Refinement bounds on [E_n, A_n, log_B_n, alpha]. A_n is capped at 20 to
    # avoid unstable large-amplitude solutions.
    bounds = ([0.0, 0.0, -20.0, 0.0], [np.inf, 20.0, 10.0, 10.0])

    for i in range(num_targets):
        y = y_2d[:, i]
        y_peak = np.max(y)
        y_max = y_peak if y_peak > 0 else 1.0
        y_norm = y / y_max

        # 1. Grid search with variable projection: [1, w] @ [E, A] ~ y.
        candidates = []
        for lb, alf, design in grid:
            coeffs, rnorm = nnls(design, y_norm)
            mse = (rnorm ** 2) / len(y_norm)
            candidates.append((mse, [coeffs[0], coeffs[1], lb, alf]))

        candidates.sort(key=lambda c: c[0])
        top_candidates = [c[1] for c in candidates[:3]]

        # 2. Refine each seed; fall back to the best grid point if every
        #    refinement fails.
        best_popt = top_candidates[0]
        best_final_mse = float('inf')

        for p0 in top_candidates:
            try:
                popt, _ = curve_fit(model_opt, x_norm, y_norm, p0=p0,
                                    bounds=bounds, method='trf',
                                    maxfev=1000, ftol=1e-6)
            except Exception:
                # Best effort: a seed that cannot be refined (e.g. it lies
                # outside the bounds) is skipped. KeyboardInterrupt and
                # SystemExit are deliberately not swallowed.
                continue

            mse = np.mean((model_opt(x_norm, *popt) - y_norm) ** 2)
            if mse < best_final_mse:
                best_final_mse = mse
                best_popt = popt

        # 3. De-normalize: E and A scale with y, B scales with x.
        E_n, A_n, log_B_n, alf = best_popt
        results[i] = [E_n * y_max, A_n * y_max, x_max * np.exp(log_B_n), alf]

    if y_in.ndim == 1:
        return results[0]
    return results
# EVOLVE-BLOCK-END

#2 Run 2 R² = 0.998467

▼

Python

# EVOLVE-BLOCK-START
"""
Improved scaling law discovery using a 4-parameter Shifted Power Law: L(x) = E + A * (data_size + delta)^(-alpha).
Key Improvements:
- Robust Initialization: Uses a fully vectorized grid search over non-linear parameters (alpha, delta) 
  combined with analytic OLS for linear parameters (A, E) to find the global basin of attraction.
- Numerical Stability: Normalizes input data to [0, 1] to prevent overflow/underflow and improve optimizer conditioning.
- Constraint Handling: Enforces physical constraints (A>=0, E>=0, alpha>=0) during both initialization and refinement.
- Efficiency: Vectorized operations allow searching thousands of parameter combinations instantly.
"""
import numpy as np
from scipy.optimize import curve_fit

def scaling_law_func(data_points, params):
    """Evaluate the shifted power law L(x) = E + A * (x + delta)**(-alpha).

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        params: [A, alpha, E, delta] as a (4,) vector or a (T, 4) array.
            Rows with only three entries are accepted and use delta = 0.

    Returns:
        Predictions of shape (N,) when a single parameter row is given,
        otherwise (N, T).
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0]

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta.reshape(1, -1)

    scale, exponent, floor = theta[:, 0], theta[:, 1], theta[:, 2]
    # The shift column is optional (3-parameter variant).
    shift = theta[:, 3] if theta.shape[1] > 3 else np.zeros_like(scale)

    # (N, 1) + (1, T) -> (N, T); clamp so the power never sees a base <= 0.
    shifted = np.maximum(sizes[:, None] + shift[None, :], 1e-10)
    curves = floor[None, :] + scale[None, :] * np.power(shifted, -exponent[None, :])

    return curves[:, 0] if curves.shape[1] == 1 else curves

def fit_scaling_law(data_points, loss_values):
    """Fit L(x) = E + A * (x + delta)**(-alpha) to one or more loss curves.

    Stage 1 is a vectorized grid search over (alpha, delta) on x / max(x);
    for every grid point the linear parameters (A, E) come from ordinary
    least squares, with A < 0 replaced by the constant model. Stage 2 refines
    the best grid point with bounded Trust Region Reflective least squares.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        loss_values: (N,) losses for one curve, or (N, T) for T curves that
            share the same data sizes.

    Returns:
        [A, alpha, E, delta] in the original x units, with shape (4,) for 1-D
        ``loss_values`` and (T, 4) otherwise.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    ys = np.asarray(loss_values)

    single_target = ys.ndim == 1
    if single_target:
        ys = ys[:, None]

    n_targets = ys.shape[1]
    results = []

    # Normalize x so the largest size is 1; keeps the powers well-conditioned.
    x_max = np.max(x) if x.size > 0 else 1.0
    x_norm = x / x_max

    # --- Grid over the non-linear parameters (normalized units) ---
    alphas = np.linspace(0.01, 3.0, 30)
    deltas = np.concatenate([[0.0], np.geomspace(1e-4, 10.0, 29)])

    # Clamp exactly like model_opt does, so a zero data size cannot produce
    # infinite features at delta = 0.
    bases = np.maximum(x_norm[:, None] + deltas[None, :], 1e-10)   # (N, n_deltas)
    features = np.power(bases[:, :, None], -alphas[None, None, :])
    # Flat grid index = delta_index * len(alphas) + alpha_index.
    features_flat = features.reshape(len(x), -1)

    f_mean = np.mean(features_flat, axis=0)
    f_var = np.var(features_flat, axis=0)
    # Constant (or non-finite) features cannot carry a slope. They are handled
    # as the constant model below; dividing by an infinite variance instead
    # would turn the MSE into 0 * inf = NaN, which np.argmin would then pick.
    degenerate = ~(f_var >= 1e-12)
    f_var_safe = np.where(degenerate, 1.0, f_var)

    def model_opt(x_n, A, alpha, E, delta):
        return E + A * np.power(np.maximum(x_n + delta, 1e-10), -alpha)

    for i in range(n_targets):
        y = ys[:, i]
        y_mean = np.mean(y)
        y_var = np.var(y)

        # --- Step 1: OLS of y on each grid feature: y ~ E + A * f ---
        cov_yf = np.mean(y[:, None] * features_flat, axis=0) - y_mean * f_mean
        A_est = np.where(degenerate, 0.0, cov_yf / f_var_safe)
        E_est = y_mean - A_est * f_mean
        # MSE = Var(y) + A^2 * Var(f) - 2 * A * Cov(y, f)
        mse_grid = y_var + (A_est ** 2 * f_var_safe) - (2 * A_est * cov_yf)

        # A < 0 is unphysical (and degenerate features have no slope): the
        # best admissible fit there is the constant model y = mean(y).
        flat_mask = degenerate | (A_est < 0)
        mse_grid[flat_mask] = y_var
        A_est[flat_mask] = 0.0
        E_est[flat_mask] = y_mean

        # Never let a non-finite score win the argmin.
        mse_grid = np.where(np.isfinite(mse_grid), mse_grid, np.inf)
        best_idx = int(np.argmin(mse_grid))

        d_idx, a_idx = divmod(best_idx, len(alphas))
        best_delta = deltas[d_idx]
        best_alpha = alphas[a_idx]
        best_A = A_est[best_idx]
        best_E = E_est[best_idx]

        # --- Step 2: bounded refinement ---
        # E is allowed a 0.5 buffer above min(y); the upper bound is kept
        # strictly positive so the bounds stay valid for non-positive data.
        min_y = np.min(y)
        bounds_lo = np.array([0.0, 0.0, 0.0, 0.0])
        bounds_hi = np.array([np.inf, 10.0, max(min_y + 0.5, 1e-8), 100.0])

        # Clip the seed into the box: curve_fit rejects an infeasible start,
        # which would otherwise silently skip the refinement.
        p0 = np.clip(
            [max(best_A, 1e-8), best_alpha, max(best_E, 0.0), best_delta],
            bounds_lo,
            bounds_hi,
        )

        try:
            popt, _ = curve_fit(
                model_opt, x_norm, y,
                p0=p0, bounds=(bounds_lo, bounds_hi),
                method='trf', ftol=1e-7, xtol=1e-7, max_nfev=2000
            )
        except Exception:
            # Best effort: keep the grid estimate if the refinement fails.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            popt = p0

        # --- Step 3: de-normalize ---
        # E + A_n * (x / S + d_n)^-alpha == E + (A_n * S^alpha) * (x + d_n * S)^-alpha
        A_n, alpha, E, delta_n = popt
        results.append([A_n * np.power(x_max, alpha), alpha, E, delta_n * x_max])

    results = np.array(results)
    if single_target:
        return results[0]
    return results
# EVOLVE-BLOCK-END

#3 Run 4 R² = 0.998467

▼

Python

# EVOLVE-BLOCK-START
"""
Vectorized Global Search for Scaling Law Discovery.
Model: L(D) = A * (D + B)^(-alpha) + E
Improvements:
1. Fully vectorized grid search over (alpha, B) pairs, solving (A, E) via linear least squares.
   This replaces nested loops with efficient matrix operations (700x faster candidates).
2. BIC-inspired complexity penalty (1%) to favor parsimonious models (B=0).
3. Robust SSE calculation and constraint enforcement within the vectorized flow.
4. Final refinement using Trust Region Reflective optimization with bounds derived from data.
"""
import numpy as np
from scipy.optimize import curve_fit

def scaling_law_func(data_points, params):
    """Evaluate the shifted power law L(D) = A * (D + B)**(-alpha) + E.

    Args:
        data_points: data sizes as an (N, 1) array (column 0 is read) or a
            flat (N,) vector.
        params: [A, alpha, E, B] as a (4,) sequence or a (T, 4) array. Rows
            with fewer than four entries are zero-padded, so a 3-parameter
            row means B = 0. Lists and tuples are accepted.

    Returns:
        Predictions of shape (N,) for 1-D ``params``, otherwise (N, T).
    """
    x = np.asarray(data_points)
    # A flat (N,) vector is N data points. np.atleast_2d would turn it into a
    # single row, of which only the first entry would survive the [:, 0].
    x = x.reshape(-1) if x.ndim <= 1 else x[:, 0]

    # Keep the array form of params: the return shape depends on its ndim,
    # and the raw argument may be a plain list without that attribute.
    raw = np.asarray(params)
    p = np.atleast_2d(raw)

    # Pad parameters if fewer than 4 were provided (e.g. B implicitly 0).
    if p.shape[1] < 4:
        p_new = np.zeros((p.shape[0], 4))
        p_new[:, :p.shape[1]] = p
        p = p_new

    A, alpha, E, B = p[:, 0], p[:, 1], p[:, 2], p[:, 3]

    # (N, 1) + (1, T) -> (N, T); clamp so the power never sees a base <= 0.
    base = np.maximum(x[:, None] + B[None, :], 1e-10)
    pred = A[None, :] * np.power(base, -alpha[None, :]) + E[None, :]

    if raw.ndim == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """Fit L(D) = A * (D + B)**(-alpha) + E to one or more loss curves.

    Phase 1 is a fully vectorized grid search over (alpha, B) on normalized
    data (x / max(x), y / max(y)); (A, E) are solved per grid point by linear
    least squares, with E < 0 projected to E = 0. Grid points with B > 0 carry
    a 1% score penalty. Phase 2 refines the best grid point per target with
    bounded Trust Region Reflective least squares.

    Args:
        data_points: data sizes as an (N, 1) array (column 0 is read) or a
            flat (N,) vector.
        loss_values: (N,) losses for one curve, or (N, T) for T curves. A
            (T, N) array is transposed when its first axis does not match N
            but its second does.

    Returns:
        [A, alpha, E, B] in the original units, with shape (4,) for 1-D
        ``loss_values`` and (T, 4) otherwise.
    """
    x = np.asarray(data_points, dtype=float)
    # A flat (N,) vector is N data points; np.atleast_2d(...)[:, 0] would keep
    # only its first entry.
    x = x.reshape(-1) if x.ndim <= 1 else x[:, 0]
    y_in = np.asarray(loss_values)

    # Ensure y is (N, T)
    if y_in.ndim == 1:
        y = y_in[:, None]
    else:
        y = y_in
        if y.shape[0] != x.shape[0] and y.shape[1] == x.shape[0]:
            y = y.T

    N_points, N_targets = y.shape

    # Normalize for numerical stability
    x_max = np.max(x)
    x_n = x / x_max

    y_max = np.max(y, axis=0)
    y_max[y_max == 0] = 1.0
    y_n = y / y_max[None, :]

    # --- Phase 1: Vectorized Grid Search ---
    # Alpha: dense in the typical range [0.01, 1.0], sparse in the tail.
    alphas = np.concatenate([np.linspace(0.01, 1.0, 25), np.linspace(1.1, 3.0, 10)])
    # B: 0.0 plus log-spaced normalized shifts.
    Bs = np.concatenate([[0.0], np.logspace(-5, 0.5, 25)])

    aa, bb = np.meshgrid(alphas, Bs, indexing='ij')
    aa_flat = aa.flatten()  # (K,)
    bb_flat = bb.flatten()  # (K,)

    # Z_k = (x + B_k)^-alpha_k for every candidate k.
    base = np.maximum(x_n[None, :] + bb_flat[:, None], 1e-10)
    Z = np.power(base, -aa_flat[:, None])  # (K, N)

    # Normal equations for y ~ A * Z + E, solved with Cramer's rule.
    sum_z = np.sum(Z, axis=1)        # (K,)
    sum_z2 = np.sum(Z ** 2, axis=1)  # (K,)
    det = sum_z2 * N_points - sum_z ** 2

    ZY = Z @ y_n                # (K, T)
    SumY = np.sum(y_n, axis=0)  # (T,)

    inv_det = 1.0 / np.maximum(det, 1e-12)
    A_est = (N_points * ZY - sum_z[:, None] * SumY[None, :]) * inv_det[:, None]
    E_est = (sum_z2[:, None] * SumY[None, :] - sum_z[:, None] * ZY) * inv_det[:, None]

    # --- Constraints & Projection ---
    # 1. A >= 0 (decaying loss)
    valid_A = A_est >= 0

    # 2. E >= 0: where violated, fix E = 0 and re-solve A alone.
    mask_neg_E = (E_est < 0) & valid_A
    if np.any(mask_neg_E):
        A_re = ZY / np.maximum(sum_z2[:, None], 1e-12)
        A_est = np.where(mask_neg_E, A_re, A_est)
        E_est = np.where(mask_neg_E, 0.0, E_est)
        valid_A = valid_A & (A_est >= 0)

    # --- Scoring ---
    # SSE expanded in terms of the precomputed sums.
    sum_y2 = np.sum(y_n ** 2, axis=0)
    sse = (A_est ** 2 * sum_z2[:, None] +
           E_est ** 2 * N_points +
           sum_y2[None, :] +
           2 * A_est * E_est * sum_z[:, None] -
           2 * A_est * ZY -
           2 * E_est * SumY[None, :])

    mse = np.maximum(sse, 0) / N_points

    # Complexity penalty (1%) for B > 0
    penalty = np.where(bb_flat > 1e-6, 1.01, 1.0)[:, None]
    score = mse * penalty

    # Filter invalid candidates; np.argmin would otherwise select a NaN.
    score[~valid_A] = np.inf
    score[det < 1e-12, :] = np.inf
    score[~np.isfinite(score)] = np.inf

    best_idx = np.argmin(score, axis=0)

    p_init = np.column_stack([
        A_est[best_idx, np.arange(N_targets)],
        aa_flat[best_idx],
        E_est[best_idx, np.arange(N_targets)],
        bb_flat[best_idx]
    ])

    # --- Phase 2: Refinement ---
    results = np.zeros((N_targets, 4))

    def model_opt(x_in, a, alf, e, b):
        return a * np.power(np.maximum(x_in + b, 1e-10), -alf) + e

    lower = np.zeros(4)

    for i in range(N_targets):
        curr_y = y_n[:, i]
        min_y = np.min(curr_y)

        # Bounds: A >= 0, alpha in [0, 5], E in [0, min_y (+ slack)], B in [0, 5].
        # The E upper bound is kept strictly positive so the box stays valid.
        upper = np.array([np.inf, 5.0, max(min_y + 1e-5, 1e-8), 5.0])

        # The grid's E estimate is not limited by min_y, so clip the seed into
        # the box; curve_fit rejects an infeasible start, which would
        # otherwise silently skip the refinement.
        p0 = np.clip(p_init[i], lower, upper)

        try:
            popt, _ = curve_fit(model_opt, x_n, curr_y, p0=p0, bounds=(lower, upper),
                                method='trf', maxfev=500, ftol=1e-6)
        except Exception:
            # Best effort: keep the grid estimate if the refinement fails.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            popt = p0

        # Denormalize: A absorbs both the y scale and x_max^alpha.
        an, alfn, en, bn = popt
        results[i] = [
            an * y_max[i] * np.power(x_max, alfn),
            alfn,
            en * y_max[i],
            bn * x_max
        ]

    if y_in.ndim == 1:
        return results[0]
    return results
# EVOLVE-BLOCK-END

#4 Run 5 R² = 0.998464

▼

Python

# EVOLVE-BLOCK-START
"""
Robust Scaling Law Discovery using Shifted Power Law: L(D) = A * (D + D0)^(-alpha) + E.
Optimization Pipeline:
1. Vectorized Variable Projection:
   - Performs a dense grid search over non-linear parameters (alpha, D0).
   - Analytically solves for linear parameters (A, E) using constrained least squares (A>=0, E>=0).
   - Identifies the global basin of attraction and the best "pure" power law (D0=0) basin.
2. Dual-Model Refinement:
   - Refines the best 4-parameter candidate (A, alpha, E, D0).
   - Refines the best 3-parameter candidate (A, alpha, E, D0=0).
   - Uses Trust Region Reflective (TRF) optimization with physical bounds.
3. Statistical Model Selection:
   - Uses AICc (Corrected Akaike Information Criterion) to select the most parsimonious model.
   - Includes a bias against negligible D0 parameters to prevent over-parameterization.
"""
import numpy as np
from scipy.optimize import curve_fit

def scaling_law_func(data_points, params):
    """Evaluate the shifted power law L(D) = A * (D + D0)**(-alpha) + E.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        params: [A, alpha, E, D0] as a (4,) vector or a (T, 4) array. Rows
            with fewer than four entries are zero-padded (so D0 defaults
            to 0).

    Returns:
        Predictions of shape (N,) when a single parameter row is given,
        otherwise (N, T).
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0]

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    n_given = theta.shape[1]
    if n_given < 4:
        # Missing trailing parameters are treated as zero.
        padded = np.zeros((theta.shape[0], 4))
        padded[:, :n_given] = theta
        theta = padded

    scale, exponent, floor, shift = theta[:, :4].T

    # Clamp the base so the power is always taken of a positive number.
    shifted = np.maximum(sizes[:, None] + shift[None, :], 1e-10)
    curves = scale[None, :] * np.power(shifted, -exponent[None, :]) + floor[None, :]

    return curves[:, 0] if curves.shape[1] == 1 else curves

def fit_scaling_law(data_points, loss_values):
    """Fit L(D) = A * (D + D0)**(-alpha) + E, choosing between D0 > 0 and D0 = 0.

    Pipeline, run per target on normalized data (x / max(x), y / max(y)):

    1. Vectorized variable projection: grid over (alpha, D0), with (A, E)
       solved analytically under A >= 0, E >= 0.
    2. Dual refinement with bounded Trust Region Reflective least squares:
       the best grid point overall seeds the 4-parameter model, the best grid
       point with D0 = 0 seeds the 3-parameter model.
    3. Model selection by AICc, with an extra penalty on the 4-parameter
       model when its fitted D0 is negligible.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        loss_values: (N,) losses for one curve, or (N, T) for T curves that
            share the same data sizes.

    Returns:
        [A, alpha, E, D0] in the original units (D0 = 0 when the 3-parameter
        model is selected), with shape (4,) for 1-D ``loss_values`` and
        (T, 4) otherwise.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    y_in = np.asarray(loss_values)
    y_2d = y_in[:, None] if y_in.ndim == 1 else y_in

    results = np.zeros((y_2d.shape[1], 4))

    # Normalize inputs for numerical stability
    x_max = np.max(x) if x.size > 0 else 1.0
    x_n = x / x_max
    N = x_n.shape[0]

    # --- Stage 1 setup: grid and target-independent sums ---
    # Alpha grid: dense in the common scaling range, sparse above 1.
    alphas = np.concatenate([
        np.linspace(0.005, 1.0, 40),
        np.linspace(1.1, 3.0, 10)
    ])
    # D0 grid: 0.0 plus log-spaced normalized shifts up to ~3.16.
    d0s = np.concatenate([
        [0.0],
        np.logspace(-5, 0.5, 24)
    ])

    aa, dd = np.meshgrid(alphas, d0s, indexing='ij')
    aa_flat = aa.ravel()
    dd_flat = dd.ravel()
    G = len(aa_flat)

    # Basis functions H[g, i] = (x_i + d0_g)^(-alpha_g), shape (G, N).
    base = np.maximum(x_n[None, :] + dd_flat[:, None], 1e-10)
    H = np.power(base, -aa_flat[:, None])

    Sum_H = np.sum(H, axis=1)
    Sum_H2 = np.sum(H ** 2, axis=1)
    Det = N * Sum_H2 - Sum_H ** 2      # determinant of the normal equations
    idx = np.where(Det > 1e-10)[0]     # grid points with a solvable system
    idx_pure = np.where(dd_flat == 0.0)[0]  # grid points with D0 = 0

    # The helpers below do not depend on the target, so they are defined once.
    def model_4(x_v, a, alf, e, d0):
        return a * np.power(np.maximum(x_v + d0, 1e-10), -alf) + e

    def model_3(x_v, a, alf, e):
        return a * np.power(np.maximum(x_v, 1e-10), -alf) + e

    def get_aicc(rss, k, n_samples):
        if rss <= 1e-20:
            return -np.inf
        if n_samples <= k + 1:
            return np.inf
        aic = n_samples * np.log(rss / n_samples) + 2 * k
        correction = (2 * k * (k + 1)) / (n_samples - k - 1)
        return aic + correction

    def refine(model, guess, lower, upper, target):
        """Bounded TRF fit from ``guess``; returns (params, residual sum of squares)."""
        # Clip the seed into the box: curve_fit rejects an infeasible start,
        # which would otherwise silently skip the refinement.
        start = np.clip(np.asarray(guess, dtype=float), lower, upper)
        try:
            popt, _ = curve_fit(model, x_n, target, p0=start,
                                bounds=(lower, upper), method='trf', maxfev=1000)
        except Exception:
            # Best effort: keep the (clipped) grid estimate if the fit fails.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            popt = start
        # Score whatever we ended up with, so a failed refinement still
        # competes with its actual RSS in the model selection.
        return popt, np.sum((model(x_n, *popt) - target) ** 2)

    for i in range(y_2d.shape[1]):
        y = y_2d[:, i]
        y_peak = np.max(y)
        y_max = y_peak if y_peak > 0 else 1.0
        y_n = y / y_max
        min_y = np.min(y_n)

        Sum_Y = np.sum(y_n)
        Sum_HY = np.sum(H * y_n[None, :], axis=1)

        RSS = np.full(G, np.inf)
        A_est = np.zeros(G)
        E_est = np.zeros(G)

        # --- Stage 1: constrained linear solve for every grid point ---
        if idx.size:
            # Unconstrained OLS via Cramer's rule on
            # [[Sum_H2, Sum_H], [Sum_H, N]] @ [A, E]^T = [Sum_HY, Sum_Y]^T
            A_u = (N * Sum_HY[idx] - Sum_H[idx] * Sum_Y) / Det[idx]
            E_u = (Sum_Y - A_u * Sum_H[idx]) / N

            mask_valid = (A_u >= 0) & (E_u >= 0)
            mask_neg_E = (E_u < 0) & (A_u >= 0)

            A_temp = np.zeros_like(A_u)
            E_temp = np.zeros_like(E_u)
            A_temp[mask_valid] = A_u[mask_valid]
            E_temp[mask_valid] = E_u[mask_valid]

            # Active constraint E = 0: re-solve A = Sum_HY / Sum_H2.
            if np.any(mask_neg_E):
                A_re = Sum_HY[idx][mask_neg_E] / (Sum_H2[idx][mask_neg_E] + 1e-12)
                A_temp[mask_neg_E] = np.maximum(A_re, 0)
                E_temp[mask_neg_E] = 0.0

            # A < 0 (or nothing assigned yet): fall back to the flat line
            # A = 0, E = mean(y).
            mask_flat = (A_u < 0) | ((A_temp == 0) & (E_temp == 0) & (~mask_neg_E))
            if np.any(mask_flat):
                A_temp[mask_flat] = 0.0
                E_temp[mask_flat] = Sum_Y / N

            A_est[idx] = A_temp
            E_est[idx] = E_temp

            Pred = A_temp[:, None] * H[idx] + E_temp[:, None]
            RSS[idx] = np.sum((Pred - y_n[None, :]) ** 2, axis=1)

        # --- Select seeds ---
        best_idx = np.argmin(RSS)
        p_init_4 = [A_est[best_idx], aa_flat[best_idx], E_est[best_idx], dd_flat[best_idx]]

        if idx_pure.size:
            best_idx_3 = idx_pure[np.argmin(RSS[idx_pure])]
            p_init_3 = [A_est[best_idx_3], aa_flat[best_idx_3], E_est[best_idx_3]]
        else:
            p_init_3 = p_init_4[:3]  # Fallback

        # --- Stage 2: dual-model refinement ---
        # Bounds: A >= 0, alpha in [0, 5], E up to min(y) + 0.1, D0 in [0, 10].
        # The E upper bound is kept strictly positive so the box stays valid.
        e_hi = max(min_y + 0.1, 1e-8)
        popt_4, rss_4 = refine(model_4, p_init_4,
                               [0, 0, 0, 0], [np.inf, 5.0, e_hi, 10.0], y_n)
        popt_3, rss_3 = refine(model_3, p_init_3,
                               [0, 0, 0], [np.inf, 5.0, e_hi], y_n)

        # --- Stage 3: AICc model selection ---
        aicc_4 = get_aicc(rss_4, 4, N)
        aicc_3 = get_aicc(rss_3, 3, N)

        # Penalize the 4-parameter model if D0 is effectively zero.
        if popt_4[3] < 1e-4:
            aicc_4 += 10.0

        if aicc_4 < aicc_3:
            final_p = popt_4
        else:
            final_p = list(popt_3) + [0.0]

        # Denormalize: A absorbs both the y scale and x_max^alpha.
        A_n, alf, E_n, D0_n = final_p
        results[i] = [
            A_n * y_max * np.power(x_max, alf),
            alf,
            E_n * y_max,
            D0_n * x_max
        ]

    return results[0] if y_in.ndim == 1 else results
# EVOLVE-BLOCK-END

#5 Run 1 R² = 0.998460

▼

Python

# EVOLVE-BLOCK-START
"""
Robust Scaling Law Discovery with Variable Projection and BIC Model Selection.
Model: Loss = E + A * (data_size + delta)^(-alpha)
Features:
- Variable Projection (VarPro): Reduces 4D optimization to 2D (alpha, delta) by solving for (A, E) exactly.
- Custom 2x2 NNLS Solver: Analytic solution for the inner linear problem for speed and stability.
- 2D Grid Initialization: Searches both alpha and delta space to avoid local minima.
- BIC Model Selection: Automatically chooses between 3-param (delta=0) and 4-param models to prevent overfitting.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate Loss = E + A * (data_size + delta)**(-alpha).

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        params: [A, alpha, E, delta], either a length-4 vector or a (T, 4)
            array with one row per curve.

    Returns:
        Predictions of shape (N,) for 1-D ``params``; for 2-D ``params`` the
        result is always (N, T), even when T == 1.
    """
    sizes = np.atleast_2d(np.asarray(data_points))[:, 0]
    theta = np.asarray(params)

    if theta.ndim != 1:
        # One parameter row per curve: lift each column to shape (1, T) so it
        # broadcasts against the (N, 1) sizes.
        scale, exponent, floor, shift = (theta[:, k][None, :] for k in range(4))
        shifted = np.maximum(sizes[:, None] + shift, 1e-10)
        return floor + scale * np.power(shifted, -exponent)

    # Single curve: the vector must hold exactly four scalars.
    scale, exponent, floor, shift = theta
    shifted = np.maximum(sizes + shift, 1e-10)
    return floor + scale * np.power(shifted, -exponent)

def fit_scaling_law(data_points, loss_values):
    """Fit Loss = E + A * (data_size + delta)**(-alpha) via variable projection.

    For any (alpha, delta) the linear parameters (A, E) are solved exactly
    under A >= 0, E >= 0, so only the two non-linear parameters are searched:
    first on a coarse grid, then with L-BFGS-B. A BIC comparison finally
    decides between the 4-parameter model and the 3-parameter model with
    delta = 0 (the latter is preferred unless clearly worse). x is normalized
    by max(x) during the search.

    Args:
        data_points: (N, 1) array of data sizes; only column 0 is read.
        loss_values: (N,) losses for one curve, or (N, T) for T curves that
            share the same data sizes.

    Returns:
        [A, alpha, E, delta] in the original x units: a plain list of four
        values for 1-D ``loss_values``, otherwise a (T, 4) array.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0].astype(float)
    ys = np.asarray(loss_values)

    single = ys.ndim == 1
    if single:
        ys = ys[:, None]

    results = []
    x_max = np.max(x) if x.size > 0 else 1.0
    x_norm = x / x_max

    # Grid for initialization: alpha (log-spaced) and delta (mixed)
    grid_alpha = np.logspace(np.log10(0.1), np.log10(3.0), 12)
    grid_delta = [0.0, 1e-4, 1e-3, 0.01, 0.1, 0.5]

    def basis(alf, dlt):
        # h = (x + delta)^-alpha on normalized x, with the base clamped > 0.
        return np.power(np.maximum(x_norm + dlt, 1e-10), -alf)

    # Analytic solver for min ||y - (E + A*h)||^2 s.t. A, E >= 0.
    # Returns (mse, A, E).
    def solve_linear(h, y):
        N = len(y)
        sy, sh = np.sum(y), np.sum(h)
        syh, sh2 = np.sum(y * h), np.sum(h * h)
        det = N * sh2 - sh * sh

        best_mse, best_A, best_E = 1e20, 0.0, 0.0
        candidates = []

        # 1. Unconstrained optimum, kept only if it is feasible.
        if abs(det) > 1e-10:
            E_u = (sy * sh2 - sh * syh) / det
            A_u = (N * syh - sh * sy) / det
            if E_u >= 0 and A_u >= 0:
                candidates.append((A_u, E_u))

        # 2. Boundary A=0 (y=E) -> E=mean(y)
        candidates.append((0.0, max(0.0, sy / N)))
        # 3. Boundary E=0 (y=A*h) -> A=syh/sh2
        if sh2 > 1e-12:
            candidates.append((max(0.0, syh / sh2), 0.0))

        for A_c, E_c in candidates:
            mse = np.mean((y - (E_c + A_c * h)) ** 2)
            if mse < best_mse:
                best_mse, best_A, best_E = mse, A_c, E_c
        return best_mse, best_A, best_E

    # The grid bases depend only on x, so evaluate them once for all targets.
    grid = [(alf, dlt, basis(alf, dlt)) for alf in grid_alpha for dlt in grid_delta]

    for i in range(ys.shape[1]):
        y = ys[:, i]

        # 1. Grid Search
        best_mse, best_p = 1e20, (0.5, 0.0)
        for alf, dlt, h in grid:
            try:
                mse = solve_linear(h, y)[0]
            except ArithmeticError:
                # A numerically failed grid point is simply skipped.
                continue
            if mse < best_mse:
                best_mse, best_p = mse, (alf, dlt)

        # 2. Optimization (VarPro): profile objective over (alpha, delta).
        def obj(p):
            if p[0] < 1e-4 or p[1] < 0:
                return 1e20
            try:
                return solve_linear(basis(p[0], p[1]), y)[0]
            except ArithmeticError:
                # Treat a numerical failure as a very bad point so the
                # optimizer moves away from it.
                return 1e20

        res = minimize(obj, best_p, bounds=[(1e-4, 10), (0, 100)], method='L-BFGS-B')
        p_opt = res.x if res.fun < best_mse else best_p
        mse_4 = min(res.fun, best_mse)

        # 3. BIC Check vs 3-param (delta=0)
        res_3 = minimize(lambda a: obj([a[0], 0.0]), [p_opt[0]],
                         bounds=[(1e-4, 10)], method='L-BFGS-B')
        mse_3 = res_3.fun

        n = len(y)
        bic_4 = n * np.log(max(mse_4, 1e-20)) + 4 * np.log(n)
        bic_3 = n * np.log(max(mse_3, 1e-20)) + 3 * np.log(n)

        if bic_3 < bic_4 + 0.5:  # Prefer 3-param
            alf_fin, dlt_fin = res_3.x[0], 0.0
        else:
            alf_fin, dlt_fin = p_opt

        # Final linear solve at the selected (alpha, delta), then undo the
        # x normalization: A absorbs x_max^alpha and delta scales with x_max.
        _, A_norm, E_opt = solve_linear(basis(alf_fin, dlt_fin), y)
        results.append([A_norm * np.power(x_max, alf_fin), alf_fin, E_opt, dlt_fin * x_max])

    return results[0] if single else np.array(results)
# EVOLVE-BLOCK-END