SLD - U-shaped Scaling Law - SLDAgent + Claude Haiku 4.5

All Runs (sorted by R²)

Best Run 2 R² = 0.929632

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM easy question performance with U-shaped pattern
Optimized 6-parameter model capturing double descent behavior with improved fitting
Refined hyperparameters and optimization strategy for robustness
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    U-shaped scaling law with 6 parameters capturing double descent:
    params[0]: a (amplitude of Gaussian dip)
    params[1]: b (curvature/width of dip)
    params[2]: c (location of minimum)
    params[3]: d (linear recovery slope)
    params[4]: e (baseline floor)
    params[5]: offset (vertical shift)
    
    Model: pred = a * exp(-b * (x - c)^2) + d * (x - c) + e + offset
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0]
    params = np.asarray(params).flatten()
    
    # Extract 6 parameters
    a, b, c, d, e, offset = params[:6]
    
    # Ensure b is positive (controls width)
    b = np.abs(b) + 1e-6
    
    # U-shaped function: Gaussian dip + linear recovery + baseline
    dx = x - c
    gaussian_term = a * np.exp(-b * dx**2)
    linear_term = d * dx
    
    pred = gaussian_term + linear_term + e + offset
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Optimize U-shaped scaling law using adaptive strategy:
    1. Normalize data for numerical stability
    2. Smart initialization from data statistics
    3. Global optimization (differential_evolution)
    4. Local refinement (L-BFGS-B)
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values).flatten()
    x = X[:, 0]
    
    # Data statistics for smart initialization
    x_min, x_max = np.min(x), np.max(x)
    x_mean = np.mean(x)
    x_std = np.std(x) + 1e-8
    x_range = x_max - x_min
    
    y_min, y_max = np.min(y), np.max(y)
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_range = y_max - y_min
    
    # Identify potential minimum location (where y is lowest)
    min_idx = np.argmin(y)
    x_at_min = x[min_idx]
    y_at_min = y[min_idx]
    
    def objective(params):
        """MSE loss with regularization"""
        pred = scaling_law_func(X, params)
        mse = np.mean((pred - y)**2)
        # Effective regularization to prevent extreme values
        reg = 0.0001 * (np.abs(params[0])**2 + params[1]**2 + params[5]**2)
        return mse + reg
    
    # Parameter bounds: [a, b, c, d, e, offset]
    bounds = [
        (-2.0 * y_range, 2.0 * y_range),  # a: amplitude (can be negative for dip)
        (0.01, 20.0),                      # b: curvature (must be positive, controls width)
        (x_min - 0.5*x_range, x_max + 0.5*x_range),  # c: minimum location
        (-3.0, 3.0),                       # d: linear slope for recovery
        (y_min - y_std, y_max + y_std),    # e: baseline floor
        (y_min - 2*y_std, y_max + 2*y_std) # offset: vertical shift
    ]
    
    # Smart initialization based on data
    init_params = np.array([
        -0.3 * y_range,        # a: slight negative dip
        2.0,                    # b: moderate curvature
        x_at_min,               # c: put minimum where y is lowest
        0.05,                   # d: slight recovery slope
        y_mean,                 # e: baseline at mean
        y_at_min - y_mean       # offset: shift towards minimum
    ])
    
    # Validate initial guess
    init_loss = objective(init_params)
    if np.isnan(init_loss) or np.isinf(init_loss):
        init_params = np.array([
            -0.5 * y_std,
            1.0,
            x_mean,
            0.1,
            y_mean,
            0.0
        ])
    
    # Global optimization with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        seed=42,
        maxiter=500,
        popsize=20,
        atol=1e-8,
        tol=1e-8,
        workers=1,
        updating='deferred',
        init='latinhypercube',
        mutation=(0.5, 1.5),
        recombination=0.7,
        polish=False
    )
    
    params_global = result_de.x
    
    # Local refinement with L-BFGS-B for high precision
    result_local = minimize(
        objective,
        params_global,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 500,
            'ftol': 1e-10,
            'gtol': 1e-8,
            'maxcor': 20
        }
    )
    
    # Choose best result
    if result_local.success and result_local.fun < result_de.fun:
        params_opt = result_local.x
    else:
        params_opt = params_global
    
    # Ensure positivity of b parameter
    params_opt[1] = np.abs(params_opt[1]) + 1e-6
    
    return params_opt

# EVOLVE-BLOCK-END

#2 Run 3 R² = 0.929434

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM easy-question scenarios with U-shaped pattern
Optimized 6-parameter centered model with robust hybrid optimization
Captures double descent via centered quadratic + linear + power + cubic terms
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    U-shaped scaling law with 6 parameters using centered parameterization:
    f(x) = a*(x-x0)^2 + b*(x-x0) + c + d*|x-x0|^1.5 + e*(x-x0)^3 + f
    
    where x0 = mean(x) is data-adaptive centering for numerical stability.
    
    This captures:
    - Quadratic term (a): main U-shape structure
    - Linear term (b): asymmetry and overall trend
    - Bias (c): primary vertical shift
    - Power term (d): smooth transitions and asymptotic behavior via 1.5 exponent
    - Cubic term (e): fine curvature adjustments
    - Offset (f): additional vertical flexibility
    
    params: [a, b, c, d, e, f] (6 parameters, max allowed)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    x = X[:, 0]  # Extract log_flops (1D)
    
    params = np.asarray(params, dtype=np.float64).flatten()
    
    # Pad if necessary
    if len(params) < 6:
        params = np.pad(params, (0, 6 - len(params)), mode='constant', constant_values=0)
    
    a, b, c, d, e, f = params[:6]
    
    # Use data-adaptive centering for numerical stability
    x0 = np.mean(x)
    x_centered = x - x0
    
    # Stability constraints on parameters to prevent overflow
    a = np.clip(a, -50, 50)
    b = np.clip(b, -50, 50)
    c = np.clip(c, -50, 50)
    d = np.clip(d, -30, 30)
    e = np.clip(e, -30, 30)
    f = np.clip(f, -50, 50)
    
    # Build prediction with all components
    quad_term = a * (x_centered ** 2)
    linear_term = b * x_centered
    power_term = d * np.sign(x_centered) * (np.abs(x_centered) ** 1.5)
    cubic_term = e * (x_centered ** 3)
    
    # Combined prediction
    pred = quad_term + linear_term + power_term + cubic_term + c + f
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fit U-shaped scaling law using robust multi-phase optimization:
    Phase 1: Global search with differential evolution
    Phase 2: Local refinement with L-BFGS-B
    Phase 3: Multiple diverse restarts for robustness
    Phase 4: Final polish for convergence
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    x = X[:, 0]
    x_mean = np.mean(x)
    x_range = np.max(x) - np.min(x)
    
    y_mean = np.mean(y)
    y_range = np.max(y) - np.min(y)
    
    if y_range < 1e-10:
        y_range = 1.0
    
    def objective(params):
        """MSE objective with light regularization"""
        try:
            pred = scaling_law_func(X, params)
            
            # Check for numerical issues
            if not np.all(np.isfinite(pred)):
                return 1e10
            
            # MSE loss
            mse = np.mean((pred - y) ** 2)
            
            # Very light L2 regularization to encourage simpler solutions
            reg = 0.0005 * np.sum(params ** 2)
            
            return mse + reg
        except (ValueError, FloatingPointError, OverflowError):
            return 1e10
    
    # Parameter bounds - carefully calibrated
    bounds = [
        (-50.0, 50.0),           # a: quadratic coefficient
        (-50.0, 50.0),           # b: linear coefficient
        (-50.0, 50.0),           # c: bias term
        (-30.0, 30.0),           # d: power term coefficient
        (-30.0, 30.0),           # e: cubic coefficient
        (-50.0, 50.0),           # f: additional offset
    ]
    
    best_params = None
    best_loss = float('inf')
    
    # Phase 1: Global search with differential evolution
    try:
        de_result = differential_evolution(
            objective,
            bounds,
            seed=42,
            maxiter=600,
            popsize=25,
            atol=1e-10,
            tol=1e-10,
            workers=1,
            updating='deferred',
            polish=True,
        )
        if de_result.fun < best_loss:
            best_loss = de_result.fun
            best_params = de_result.x
    except Exception:
        pass
    
    # Phase 2: Local refinement with L-BFGS-B
    if best_params is not None:
        try:
            refine_result = minimize(
                objective,
                best_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 1000, 'ftol': 1e-11, 'gtol': 1e-9}
            )
            if refine_result.fun < best_loss:
                best_loss = refine_result.fun
                best_params = refine_result.x
        except Exception:
            pass
    
    # Phase 3: Multiple restarts from diverse initializations
    init_strategies = [
        lambda: np.array([0.3, -0.5, y_mean, 0.1, 0.05, 0.0]),
        lambda: np.array([0.8, -1.0, y_mean, 0.2, 0.1, 0.0]),
        lambda: np.array([0.1, 0.0, y_mean, 0.05, 0.02, 0.0]),
        lambda: np.array([-0.2, 0.5, y_mean, -0.1, -0.05, 0.0]),
        lambda: np.array([0.5, -0.2, y_mean, 0.15, 0.08, -0.5*y_range]),
        lambda: np.array([1.0, -1.5, y_mean, 0.3, 0.15, 0.0]),
        lambda: np.random.uniform(-10, 10, 6),
        lambda: np.random.uniform(-5, 5, 6),
    ]
    
    for init_func in init_strategies:
        try:
            init_params = init_func()
            # Ensure bounds compliance
            init_params = np.clip(init_params, np.array(bounds)[:, 0], np.array(bounds)[:, 1])
            
            result = minimize(
                objective,
                init_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 500, 'ftol': 1e-10, 'gtol': 1e-9}
            )
            
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
    
    # Phase 4: Final polish if we have a solution
    if best_params is not None:
        try:
            polish_result = minimize(
                objective,
                best_params,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 2000, 'ftol': 1e-12, 'gtol': 1e-10}
            )
            if polish_result.fun < best_loss:
                best_params = polish_result.x
        except Exception:
            pass
    
    # Fallback initialization
    if best_params is None:
        best_params = np.array([
            0.5,        # a: moderate positive quadratic for U-shape
            -0.5,       # b: slight negative linear trend
            y_mean,     # c: centered at data mean
            0.1,        # d: small power term
            0.05,       # e: small cubic term
            0.0         # f: no additional offset
        ])
    
    return best_params

# EVOLVE-BLOCK-END

#3 Run 1 R² = 0.928072

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning with U-shaped/double descent patterns
Evolved approach: Simplified model with better generalization
Focus: Reduced complexity, improved cross-task robustness, better MAE/NMAE
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Improved U-shaped scaling law: asymmetric polynomial + exponential decay
    params = [a, b, c, d, e, f]
    f(x) = a + b*x + c*x^2 + d*exp(-e*|x-f|)
    
    Key improvements:
    - Exponential decay (instead of sigmoid) for cleaner U-shape
    - Absolute value distance for symmetric asymmetry control
    - Better captures the characteristic double descent pattern
    - Fewer numerical issues with cleaner functional form
    """
    X = np.atleast_2d(np.asarray(data_points))
    x = X[:, 0]  # Extract log_flops
    params = np.asarray(params, dtype=np.float64)
    
    # Ensure params has 6 elements
    if len(params) < 6:
        params = np.pad(params, (0, 6 - len(params)), mode='constant')
    
    a, b, c, d, e, f = params[:6]
    
    # Base polynomial: a + b*x + c*x^2
    poly = a + b * x + c * x**2
    
    # Exponential modulation: d*exp(-e*|x-f|)
    # Creates a "valley" or "peak" at position f
    distance = np.abs(x - f)
    exp_arg = np.clip(-e * distance, -100, 0)  # Ensure non-positive for numerical stability
    exp_term = d * np.exp(exp_arg)
    
    pred = poly + exp_term
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Optimized fitting with better initialization strategy
    - Simplified parameter bounds based on physical intuition
    - Two-phase optimization: global + local
    - Improved initialization from polynomial regression
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    x = X[:, 0]
    
    # Compute data statistics
    x_min, x_max = np.min(x), np.max(x)
    x_range = x_max - x_min + 1e-8
    x_mean = np.mean(x)
    y_min, y_max = np.min(y), np.max(y)
    y_range = y_max - y_min + 1e-8
    y_mean = np.mean(y)
    
    # Fit initial polynomial for better initialization
    poly_coeffs = np.polyfit(x, y, 2)
    a_init, b_init, c_init = poly_coeffs[0], poly_coeffs[1], poly_coeffs[2]
    
    def objective(params):
        """Robust MAE-based objective"""
        try:
            pred = scaling_law_func(X, params)
            mae = np.mean(np.abs(pred - y))
            # Small L2 regularization on parameters for stability
            reg = 1e-5 * np.sum(params**2)
            return mae + reg
        except:
            return 1e10
    
    # Adaptive bounds based on data characteristics
    bounds = [
        (y_mean - 2*y_range, y_mean + 2*y_range),      # a: baseline
        (-2*y_range/x_range, 2*y_range/x_range),        # b: linear slope
        (-2*y_range/x_range**2, 2*y_range/x_range**2),  # c: quadratic
        (-y_range, y_range),                             # d: exponential amplitude
        (0.01, 20.0),                                     # e: exponential decay rate
        (x_min - x_range, x_max + x_range)               # f: valley/peak position
    ]
    
    # Phase 1: Global optimization with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        seed=42,
        maxiter=500,
        popsize=25,
        atol=1e-8,
        tol=1e-8,
        workers=1,
        updating='deferred',
        strategy='best1bin'
    )
    
    params_global = result_de.x
    
    # Phase 2: Local refinement with L-BFGS-B
    result_local = minimize(
        objective,
        params_global,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'ftol': 1e-10,
            'gtol': 1e-9,
            'maxiter': 2000,
            'maxfun': 4000
        }
    )
    
    params_opt = result_local.x if result_local.success else params_global
    
    # Ensure exactly 6 parameters
    params_opt = np.asarray(params_opt, dtype=np.float64)
    if len(params_opt) < 6:
        params_opt = np.pad(params_opt, (0, 6 - len(params_opt)), mode='constant')
    else:
        params_opt = params_opt[:6]
    
    # Post-optimization bounds clipping for physical validity
    params_opt[4] = np.clip(params_opt[4], 0.001, 100.0)  # e: decay rate must be positive
    
    return params_opt
# EVOLVE-BLOCK-END

#4 Run 4 R² = 0.925944

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning with U-shaped pattern
Optimized double descent model with refined parameterization and enhanced optimization.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution, least_squares

def scaling_law_func(data_points, params):
    """
    U-shaped scaling law with 6 parameters optimized for double descent:
    y = a + b*x + c*exp(-d*(x-e)^2) + f*(x-e)^2
    
    Parameters:
    - a: baseline performance
    - b: linear scaling trend
    - c: Gaussian dip amplitude (negative for U-shape)
    - d: Gaussian width (positive, controls dip sharpness)
    - e: Gaussian center (dip location)
    - f: quadratic recovery coefficient (positive for improvement at scale)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    x = X[:, 0]
    params = np.asarray(params, dtype=np.float64)
    
    a, b, c, d, e, f = params[0], params[1], params[2], params[3], params[4], params[5]
    
    # Numerical stability: ensure d is positive
    d_safe = np.abs(d) + 1e-8
    
    # U-shaped pattern: linear trend + Gaussian dip + quadratic recovery
    gaussian_dip = c * np.exp(-d_safe * (x - e) ** 2)
    quadratic_recovery = f * (x - e) ** 2
    
    pred = a + b * x + gaussian_dip + quadratic_recovery
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fit U-shaped scaling law with multi-stage optimization:
    1. Data-driven initialization
    2. Global search with differential evolution
    3. Local refinement with least-squares
    4. Final polish with L-BFGS-B
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    x = X[:, 0]
    x_min, x_max = x.min(), x.max()
    x_range = x_max - x_min
    x_mid = (x_min + x_max) / 2
    y_min, y_max = y.min(), y.max()
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-9
    
    # Detect dip location more robustly
    window_size = max(3, len(y) // 20)
    smoothed_y = np.convolve(y, np.ones(window_size)/window_size, mode='same')
    dip_idx = np.argmin(smoothed_y)
    x_dip = x[dip_idx]
    
    def objective(params):
        """MSE loss with numerical stability"""
        try:
            pred = scaling_law_func(X, params)
            if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
                return 1e10
            mse = np.mean((pred - y) ** 2)
            return mse
        except:
            return 1e10
    
    def objective_residuals(params):
        """Residuals for least squares"""
        try:
            pred = scaling_law_func(X, params)
            if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
                return np.full_like(y, 1e10)
            return pred - y
        except:
            return np.full_like(y, 1e10)
    
    # Tighter, more informed bounds
    bounds = [
        (y_min - 2*y_std, y_max + 2*y_std),        # a: baseline
        (-1.5, 1.5),                                # b: linear coefficient
        (-2.5*y_std, -0.1*y_std),                   # c: negative Gaussian amplitude (dip)
        (0.1, 15.0),                                # d: Gaussian width (positive)
        (x_min - 0.5*x_range, x_max + 0.5*x_range),# e: dip location
        (0.001, 0.5),                               # f: positive quadratic (recovery)
    ]
    
    # Multiple smart initializations
    init_candidates = [
        np.array([y_mean, 0.02, -0.8*y_std, 2.0, x_dip, 0.05]),
        np.array([y_min, -0.05, -y_std, 3.0, x_dip, 0.08]),
        np.array([y_mean + 0.5*y_std, 0.0, -y_std, 2.5, x_mid, 0.03]),
        np.array([y_mean - 0.5*y_std, 0.1, -0.5*y_std, 1.5, x_dip, 0.1]),
    ]
    
    best_init = init_candidates[0]
    best_init_loss = objective(best_init)
    
    for candidate in init_candidates[1:]:
        cand_loss = objective(candidate)
        if cand_loss < best_init_loss:
            best_init_loss = cand_loss
            best_init = candidate
    
    # Global optimization with differential evolution
    try:
        result_de = differential_evolution(
            objective,
            bounds,
            seed=42,
            maxiter=1000,
            popsize=30,
            atol=1e-8,
            tol=1e-8,
            workers=1,
            updating='deferred',
            polish=True
        )
        params_de = result_de.x
        loss_de = result_de.fun
    except:
        params_de = best_init
        loss_de = best_init_loss
    
    # Local refinement with least squares
    try:
        result_ls = least_squares(
            objective_residuals,
            params_de,
            bounds=(
                [b[0] for b in bounds],
                [b[1] for b in bounds]
            ),
            ftol=1e-11,
            xtol=1e-11,
            gtol=1e-11,
            max_nfev=10000,
            loss='soft_l1',
            f_scale=1.0
        )
        params_ls = result_ls.x
        loss_ls = objective(params_ls)
    except:
        params_ls = params_de
        loss_ls = loss_de
    
    # Final L-BFGS-B polish with stricter tolerance
    try:
        result_lbfgs = minimize(
            objective,
            params_ls,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'ftol': 1e-10,
                'gtol': 1e-8,
                'maxiter': 3000,
                'maxfun': 10000
            }
        )
        params_final = result_lbfgs.x
        loss_final = result_lbfgs.fun
    except:
        params_final = params_ls
        loss_final = loss_ls
    
    # Validate and return best result
    if np.all(np.isfinite(params_final)) and loss_final < 1e9:
        return params_final
    elif np.all(np.isfinite(params_ls)) and loss_ls < 1e9:
        return params_ls
    else:
        return params_de

# EVOLVE-BLOCK-END

#5 Run 5 R² = 0.925939

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM easy questions - U-shaped double descent pattern
Valley-centered quadratic model with inverse smoothing and asymmetric recovery
Proven 6-parameter architecture optimized for convergence and accuracy
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
import warnings
warnings.filterwarnings('ignore')

def scaling_law_func(data_points, params):
    """
    U-shaped scaling law: a*(x-x0)^2 + c + b/(1+|x-x0|) + d*sign(x-x0)*|x-x0|^e
    
    params: [a, b, c, x0, d, e]
    - a: quadratic curvature (U-shape width)
    - b: inverse distance term (smoothing near valley)
    - c: baseline performance
    - x0: location of valley (log_flops at minimum)
    - d: asymmetric power term (captures left/right asymmetry)
    - e: power exponent for asymmetric recovery
    """
    X = np.atleast_1d(np.asarray(data_points, dtype=np.float64)).flatten()
    params = np.atleast_1d(np.asarray(params, dtype=np.float64))
    
    if len(params) < 6:
        params = np.pad(params, (0, 6 - len(params)), mode='constant')
    
    a, b, c, x0, d, e = params[:6]
    
    # Center around valley location
    dx = X - x0
    
    # Quadratic term: main U-shape structure
    quad = a * np.clip(dx**2, 0, 1000)
    
    # Inverse distance term: smooth regularization near valley
    inv_term = b / (1.0 + np.abs(dx))
    
    # Asymmetric power term: captures different rates of improvement left/right
    sign_dx = np.sign(dx)
    abs_dx = np.abs(dx)
    e_safe = np.clip(e, 0.1, 3.0)
    power_term = d * sign_dx * np.power(np.clip(abs_dx, 0, 100), e_safe)
    
    pred = quad + inv_term + c + power_term
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Fit U-shaped scaling law using proven hybrid optimization strategy.
    DE for global search + aggressive L-BFGS-B refinement + perturbation restarts
    """
    X = np.atleast_1d(np.asarray(data_points, dtype=np.float64)).flatten()
    y = np.atleast_1d(np.asarray(loss_values, dtype=np.float64)).flatten()
    
    N = len(X)
    if N < 2:
        return np.array([0., 0., np.mean(y), 0., 0., 1.0])
    
    # Data statistics for robust initialization
    x_min, x_max = np.min(X), np.max(X)
    x_range = x_max - x_min if x_max > x_min else 1.0
    
    y_min, y_max = np.min(y), np.max(y)
    y_range = y_max - y_min if y_max > y_min else 1.0
    y_mean = np.mean(y)
    
    # Find valley location (minimum value = best performance)
    idx_valley = np.argmin(y)
    x_valley_init = X[idx_valley]
    y_valley_init = y[idx_valley]
    
    # Intelligent initialization based on data
    valley_depth = max(y_mean - y_valley_init, 1e-8)
    a_init = max(0.01, valley_depth / (x_range**2 + 1e-8))
    c_init = y_valley_init
    b_init = 0.1 * y_range
    d_init = 0.05 * y_range
    e_init = 1.0
    
    init_params = np.array([a_init, b_init, c_init, x_valley_init, d_init, e_init])
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            if np.any(~np.isfinite(pred)):
                return 1e10
            mse = np.mean((pred - y)**2)
            return mse
        except:
            return 1e10
    
    # Data-adaptive bounds with proven ranges
    bounds = [
        (0.0001, 100),                      # a: quadratic coefficient
        (-10*y_range, 10*y_range),         # b: inverse term coefficient
        (y_min - 2*y_range, y_max + 2*y_range),  # c: baseline
        (x_min - x_range, x_max + x_range),      # x0: valley location
        (-10*y_range, 10*y_range),         # d: asymmetric coefficient
        (0.1, 3.0)                         # e: power exponent
    ]
    
    best_params = init_params.copy()
    best_loss = objective(best_params)
    
    # Strategy 1: Global optimization with differential_evolution (proven parameters)
    try:
        result_de = differential_evolution(
            objective,
            bounds,
            seed=42,
            maxiter=400,
            workers=1,
            updating='deferred',
            atol=1e-8,
            tol=1e-8,
            polish=True
        )
        if result_de.fun < best_loss:
            best_params = result_de.x.copy()
            best_loss = result_de.fun
    except:
        pass
    
    # Strategy 2: Aggressive local refinement with L-BFGS-B
    try:
        result_local = minimize(
            objective,
            best_params,
            method='L-BFGS-B',
            bounds=bounds,
            options={'ftol': 1e-10, 'maxiter': 1000, 'maxcor': 10}
        )
        if result_local.success and result_local.fun < best_loss:
            best_params = result_local.x.copy()
            best_loss = result_local.fun
    except:
        pass
    
    # Strategy 3: Multi-start perturbation with additive noise (not multiplicative)
    try:
        for trial in range(2):
            # Use additive perturbation scaled by data ranges (more stable than multiplicative)
            perturbation = np.random.normal(0, 0.1, 6)
            perturbed = best_params.copy()
            perturbed[0] += perturbation[0] * 0.1 * best_params[0] if best_params[0] > 0 else perturbation[0] * 0.01
            perturbed[1] += perturbation[1] * 0.5 * y_range
            perturbed[2] += perturbation[2] * 0.5 * y_range
            perturbed[3] += perturbation[3] * 0.3 * x_range
            perturbed[4] += perturbation[4] * 0.5 * y_range
            perturbed[5] += perturbation[5] * 0.15
            
            # Clip to bounds
            for i, (lo, hi) in enumerate(bounds):
                perturbed[i] = np.clip(perturbed[i], lo, hi)
            
            result_pert = minimize(
                objective,
                perturbed,
                method='L-BFGS-B',
                bounds=bounds,
                options={'ftol': 1e-9, 'maxiter': 500}
            )
            if result_pert.fun < best_loss:
                best_params = result_pert.x.copy()
                best_loss = result_pert.fun
    except:
        pass
    
    return best_params
# EVOLVE-BLOCK-END