SFT Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.999306
Mean R²: 0.998120
Min R²: 0.995633
Runs: 5

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.999306
Python
# EVOLVE-BLOCK-START
"""
Enhanced 4-parameter scaling law with adaptive transition behavior
Form: L(N) = L_inf + A / (N^alpha + c)
This smooth transition form avoids logarithmic singularities while capturing
both rapid initial decay and gradual asymptotic convergence
"""
import numpy as np
from scipy.optimize import differential_evolution, minimize, dual_annealing

def scaling_law_func(data_points, params):
    """
    4-parameter scaling law: L(N) = L_inf + A / (N^alpha + c)
    params = [L_inf, A, alpha, c]
    The offset c sets the transition scale: for N^alpha << c the curve is
    roughly flat near L_inf + A / c; for N^alpha >> c it decays as a power law
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    N = X[:, 0]
    
    params = np.asarray(params, dtype=np.float64)
    if params.ndim == 1:
        params = params[None, :]
    
    L_inf = params[:, 0]
    A = params[:, 1]
    alpha = params[:, 2]
    c = params[:, 3]
    
    # Clamp N at 1.0 so small or non-positive inputs cannot destabilize the power term
    N_safe = np.maximum(N, 1.0)
    
    # Smooth power-law form with offset for adaptive transition
    denominator = np.maximum(N_safe[:, None] ** alpha[None, :] + np.abs(c[None, :]), 1e-10)
    
    pred = L_inf[None, :] + A[None, :] / denominator
    
    return pred[:, 0] if pred.shape[1] == 1 else pred
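
# Shape contract of scaling_law_func (a note on the broadcasting above):
# a single length-4 parameter vector yields a 1-D array of predictions,
# one per data point; a (k, 4) batch of parameter vectors yields an
# (n_points, k) matrix with one column per parameter vector.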


def fit_scaling_law(data_points, loss_values):
    """
    Advanced fitting using dual annealing for better global search
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    N = X[:, 0]
    y2d = y[:, None] if y.ndim == 1 else y
    
    # Data statistics for adaptive bounds
    y_min, y_max = np.min(y2d), np.max(y2d)
    y_range = y_max - y_min
    N_min, N_max = np.min(N), np.max(N)
    
    # Compute initial estimates for better convergence
    # Estimate L_inf from minimum loss with buffer
    L_inf_est = y_min - 0.1
    
    # Estimate A and alpha from log-log fit of initial decay
    if len(N) > 2:
        idx_low = N < np.percentile(N, 50)
        if np.sum(idx_low) > 2:
            N_low = N[idx_low]
            y_low = y2d[idx_low, 0] - L_inf_est
            y_low = np.maximum(y_low, 1e-6)
            
            log_N = np.log(N_low)
            log_y = np.log(y_low)
            
            # For N^alpha >> c the model reduces to L_inf + A * N^-alpha, so
            # log(y - L_inf) ~ log(A) - alpha*log(N); fit a line in log space
            coeffs = np.polyfit(log_N, log_y, 1)
            alpha_est = -coeffs[0]
            A_est = np.exp(coeffs[1])
        else:
            alpha_est = 0.5
            A_est = y_range * (N_max ** 0.5)
    else:
        alpha_est = 0.5
        A_est = y_range * (N_max ** 0.5)
    
    # Informed parameter bounds: [L_inf, A, alpha, c]
    bounds = [
        (y_min - 0.5, y_min + 0.8),              # L_inf: near minimum with wider margin
        (1e-3, y_range * N_max**1.5),            # A: power coefficient
        (0.1, 1.5),                               # alpha: wider range for flexibility
        (1.0, N_max * 0.15)                       # c: transition offset
    ]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            pred = pred[:, None] if pred.ndim == 1 else pred
            
            # Mean squared error
            mse = np.mean((pred - y2d) ** 2)
            
            # Adaptive regularization based on parameter magnitudes; weights
            # (~1e-9) are far below typical MSE, so they only break ties
            reg_A = 1e-9 * (params[1] / (y_range * N_max**0.5))**2
            reg_c = 1e-9 * (params[3] / N_max)**2
            
            # Gently pull alpha toward 0.5, the same prior used for the
            # initial estimate above
            alpha_reg = 1e-8 * (params[2] - 0.5)**2
            
            # Penalize L_inf far from minimum
            L_inf_reg = 1e-7 * (params[0] - y_min)**2
            
            return mse + reg_A + reg_c + alpha_reg + L_inf_reg
        except Exception:
            return 1e10
    
    # Try dual annealing first (better global search than DE in some cases)
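    # initial_temp=5230.0, visit=2.62 and accept=-5.0 below are scipy's
    # documented defaults for dual_annealing, written out explicitly;
    # no_local_search=True skips the built-in local polish, deferring it
    # to the dedicated refinement stages further down.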
    try:
        result_da = dual_annealing(
            objective,
            bounds,
            seed=42,
            maxiter=400,
            initial_temp=5230.0,
            visit=2.62,
            accept=-5.0,
            no_local_search=True
        )
        params_opt = result_da.x
        best_obj = result_da.fun
    except Exception:
        # Fall back to the heuristic initial estimates computed above
        params_opt = np.array([L_inf_est, A_est, alpha_est, N_max * 0.05])
        best_obj = objective(params_opt)
    
    # Differential evolution as a second global search; keep the better result
    try:
        result_de = differential_evolution(
            objective, 
            bounds, 
            seed=43,
            maxiter=500,
            popsize=20,
            atol=1e-11,
            tol=1e-11,
            strategy='best1bin',
            mutation=(0.5, 1.5),
            recombination=0.8,
            polish=False,
            workers=1
        )
        if result_de.fun < best_obj:
            params_opt = result_de.x
            best_obj = result_de.fun
    except Exception:
        pass
    
    # Multi-stage local refinement with multiple methods
    for method in ['L-BFGS-B', 'Powell', 'TNC']:
        try:
            result_local = minimize(
                objective,
                params_opt,
                method=method,
                bounds=bounds if method in ['L-BFGS-B', 'TNC'] else None,
                options={'ftol': 1e-12, 'maxiter': 800}
            )
            if result_local.success and result_local.fun < best_obj:
                params_opt = result_local.x
                best_obj = result_local.fun
        except Exception:
            pass
    
    # Final high-precision polish with Nelder-Mead (unbounded, tight tolerances)
    try:
        result_nm = minimize(
            objective,
            params_opt,
            method='Nelder-Mead',
            options={'xatol': 1e-11, 'fatol': 1e-12, 'maxiter': 500}
        )
        if result_nm.success and result_nm.fun < best_obj:
            params_opt = result_nm.x
    except Exception:
        pass
    
    return params_opt
# EVOLVE-BLOCK-END
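
Below is a minimal usage sketch, not part of the submitted run: it fits the model above to synthetic data and reports R². It assumes scaling_law_func and fit_scaling_law are in scope (e.g., appended to the same file); the true parameters and noise level are illustrative.

Python
# Hypothetical smoke test for the scaling-law fit above (synthetic data only).
import numpy as np

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    N = np.logspace(2, 6, 25)                     # dataset sizes, 1e2 .. 1e6
    true_params = [1.8, 40.0, 0.6, 50.0]          # illustrative [L_inf, A, alpha, c]

    # scaling_law_func expects a (n_points, n_features) array; N goes in column 0
    y = scaling_law_func(N[:, None], np.asarray(true_params))
    y = y + rng.normal(0.0, 0.005, size=y.shape)  # small observation noise

    params = fit_scaling_law(N[:, None], y)
    pred = scaling_law_func(N[:, None], params)

    # Coefficient of determination, the metric reported on the leaderboard
    ss_res = np.sum((y - pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    print("fitted params:", np.round(params, 4))
    print("R^2 =", 1.0 - ss_res / ss_tot)
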
#2 Run 3 R² = 0.999206
#3 Run 4 R² = 0.998917
#4 Run 2 R² = 0.997537
#5 Run 5 R² = 0.995633