SLD - Data-Constrained Scaling Law - SLDAgent + Claude Haiku 4.5

All Runs (sorted by R²)

Best Run 3 R² = 0.904712

▼

Python

# EVOLVE-BLOCK-START
"""
Simplified and optimized scaling law with code efficiency focus.
Maintains high predictive accuracy through smart parameter bounds and
efficient hybrid optimization strategy.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Scaling law: L = a + b/D^α + c/N^β + d*log₁₀(V/V₀)
    Uses 7 parameters: [a, b, c, d, α, β, v₀]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    p = np.asarray(params, dtype=np.float64)
    
    V, N, D = X[:, 0], X[:, 1], X[:, 2]
    a, b, c, d, alpha, beta, v0 = p
    
    alpha = np.clip(alpha, 0.08, 1.6)
    beta = np.clip(beta, 0.08, 1.6)
    v0_val = 10.0 ** np.clip(v0, 3.5, 9.5)
    
    loss = (a + 
            b / np.power(np.maximum(D, 1e4), alpha) +
            c / np.power(np.maximum(N, 1e4), beta) +
            d * np.log10(np.maximum(V, 1e2) / v0_val))
    
    return np.clip(loss, 0.3, 12.0)


def fit_scaling_law(data_points, loss_values):
    """
    Optimized fitting with efficient bounds and aggressive refinement.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    y_min, y_max, y_mean, y_std = np.min(y), np.max(y), np.mean(y), np.std(y)
    y_range = y_max - y_min
    
    bounds = [
        (max(0.1, y_min - 2), min(y_mean + y_std, y_max)),
        (0.001, max(10, y_range * 200)),
        (0.001, max(10, y_range * 200)),
        (-2.0, 2.0),
        (0.05, 2.0),
        (0.05, 2.0),
        (3.0, 10.0)
    ]
    
    def obj(p):
        try:
            pred = scaling_law_func(X, p)
            mse = np.mean((pred - y) ** 2)
            return mse if np.isfinite(mse) else 1e12
        except:
            return 1e12
    
    # Global search with efficient settings
    de_result = differential_evolution(
        obj, bounds, seed=42, maxiter=250, popsize=15,
        atol=1e-10, tol=1e-10, workers=1, updating='deferred', polish=True
    )
    
    # Aggressive local refinement with higher iteration limit
    local_result = minimize(
        obj, de_result.x, method='L-BFGS-B', bounds=bounds,
        options={'maxiter': 600, 'ftol': 1e-12, 'gtol': 1e-10}
    )
    
    best_params = local_result.x if local_result.fun < de_result.fun else de_result.x
    return np.array([np.clip(best_params[i], bounds[i][0], bounds[i][1]) 
                     for i in range(7)])
# EVOLVE-BLOCK-END

#2 Run 1 R² = 0.898748

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM training with data constraints.
Efficient reciprocal power law model with streamlined two-stage optimization.
Simplified approach: one global search + targeted local refinement.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Scaling law: L = A + B/(D^α) + C/(P^β) + E*log(V) + F*log(D*P)
    
    Components:
    - A: base loss floor
    - B/(D^α): token scaling (loss decreases with more tokens)
    - C/(P^β): parameter scaling (loss decreases with more parameters)
    - E*log(V): vocabulary effect (sublinear growth with unique tokens)
    - F*log(D*P): log-linear scale interaction (captures product effects)
    
    Parameters (7): [A, B, alpha, C, beta, E, F]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    N, F = X.shape
    
    if F != 3:
        raise ValueError(f"Expected 3 features, got {F}")
    
    unique_tokens = X[:, 0]
    param_count = X[:, 1]
    token_count = X[:, 2]
    
    params = np.atleast_1d(np.asarray(params, dtype=np.float64))
    
    if len(params) != 7:
        raise ValueError(f"Expected 7 parameters, got {len(params)}")
    
    A, B, alpha, C, beta, E, F_coeff = params
    
    # Constrain exponents to stable range
    alpha = np.clip(alpha, 0.05, 1.8)
    beta = np.clip(beta, 0.05, 1.8)
    
    # Safe feature values
    token_safe = np.maximum(token_count, 1e6)
    param_safe = np.maximum(param_count, 1e6)
    vocab_safe = np.maximum(unique_tokens, 1e4)
    
    # Scaling law components
    base_loss = A
    token_term = B / np.power(token_safe, alpha)
    param_term = C / np.power(param_safe, beta)
    vocab_term = E * np.log(vocab_safe + 1.0)
    
    # Log-linear interaction term - more stable scaling behavior
    scale_product = token_safe * param_safe
    interaction_term = F_coeff * np.log(scale_product / 1e14 + 1.0)
    
    loss = base_loss + token_term + param_term + vocab_term + interaction_term
    
    return np.clip(loss, 0.5, 15.0)


def fit_scaling_law(data_points, loss_values):
    """
    Fit scaling law using efficient two-stage optimization:
    1. Differential evolution for robust global search
    2. L-BFGS-B for precise local refinement
    
    Streamlined approach: focused initialization and targeted refinement.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.atleast_1d(np.asarray(loss_values, dtype=np.float64))
    
    N, F = X.shape
    if F != 3:
        raise ValueError(f"Expected 3 features, got {F}")
    if len(y) != N:
        raise ValueError("Mismatched number of samples")
    
    # Loss statistics
    loss_min = np.min(y)
    loss_max = np.max(y)
    loss_mean = np.mean(y)
    loss_q1 = np.percentile(y, 25)
    loss_range = loss_max - loss_min
    
    def objective(params_flat):
        """MSE objective with NaN handling"""
        try:
            pred = scaling_law_func(X, params_flat)
            if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
                return 1e10
            return np.mean((pred - y) ** 2)
        except:
            return 1e10
    
    # Bounds based on data range
    bounds = [
        (loss_min * 0.5, loss_max * 0.6),
        (0.0001, 200.0),
        (0.05, 1.8),
        (0.0001, 200.0),
        (0.05, 1.8),
        (-10.0, 10.0),
        (-5.0, 5.0),  # Reduced range for log interaction term
    ]
    
    # Primary initialization strategy
    init_primary = np.array([
        loss_q1,
        loss_range * 1.0,
        0.25,
        loss_range * 0.5,
        0.15,
        loss_range * 0.2,
        0.05,  # Small log interaction
    ])
    
    # Aggressive initialization strategy
    init_aggressive = np.array([
        loss_mean * 0.6,
        loss_range * 2.0,
        0.4,
        loss_range * 1.5,
        0.3,
        loss_range * 0.5,
        0.3,  # Larger log interaction
    ])
    
    # Clip to bounds
    init_primary = np.array([np.clip(x, b[0], b[1]) for x, b in zip(init_primary, bounds)])
    init_aggressive = np.array([np.clip(x, b[0], b[1]) for x, b in zip(init_aggressive, bounds)])
    
    # Stage 1: Global search with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        seed=42,
        maxiter=800,
        popsize=30,
        mutation=(0.5, 1.5),
        recombination=0.7,
        atol=1e-12,
        tol=1e-12,
        workers=1,
        updating='deferred',
        polish=False
    )
    
    # Stage 2: Local refinement from DE result
    result_bfgs_de = minimize(
        objective,
        result_de.x,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-14, 'gtol': 1e-10, 'maxiter': 1000, 'maxcor': 25}
    )
    
    # Stage 3: Local refinement from primary init
    result_bfgs_primary = minimize(
        objective,
        init_primary,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-14, 'gtol': 1e-10, 'maxiter': 800, 'maxcor': 25}
    )
    
    # Stage 4: Local refinement from aggressive init
    result_bfgs_aggressive = minimize(
        objective,
        init_aggressive,
        method='L-BFGS-B',
        bounds=bounds,
        options={'ftol': 1e-14, 'gtol': 1e-10, 'maxiter': 800, 'maxcor': 25}
    )
    
    # Select best result
    candidates = [
        (result_de.fun, result_de.x),
        (result_bfgs_de.fun, result_bfgs_de.x),
        (result_bfgs_primary.fun, result_bfgs_primary.x),
        (result_bfgs_aggressive.fun, result_bfgs_aggressive.x),
    ]
    
    best_loss, best_params = min(candidates, key=lambda x: x[0])
    best_params = np.array([np.clip(p, b[0], b[1]) for p, b in zip(best_params, bounds)])
    
    return best_params

# EVOLVE-BLOCK-END

#3 Run 2 R² = 0.680568

▼

Python

# EVOLVE-BLOCK-START
"""
Enhanced scaling law for LLM training under data constraints.
Improved 4-term formulation: L = a + b/D^α + c/P^β + d*log(V)/V^γ
Features: adaptive regularization, correlation-aware initialization, refined multi-stage optimization.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Scaling law: L = a + b/D^α + c/P^β + d*log(V)/V^γ
    
    Parameters (7 total):
    - a: baseline loss
    - b: data scaling coefficient
    - c: parameter scaling coefficient
    - d: vocabulary coefficient
    - alpha, beta, gamma: exponents for data, params, vocabulary
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.atleast_1d(np.asarray(params, dtype=np.float64))
    
    # Extract features with safety clipping
    V = np.maximum(X[:, 0], 1e5)
    P = np.maximum(X[:, 1], 1e7)
    D = np.maximum(X[:, 2], 1e8)
    
    # Pad parameters to length 7
    if len(params) < 7:
        params = np.concatenate([params, np.zeros(7 - len(params))])
    
    a, b, c, d, alpha, beta, gamma = params[:7]
    
    # Clip exponents for numerical stability
    alpha = np.clip(alpha, 0.01, 1.5)
    beta = np.clip(beta, 0.01, 1.5)
    gamma = np.clip(gamma, 0.01, 1.5)
    
    # Core scaling terms
    term_base = a
    term_data = b / np.power(D, alpha)
    term_param = c / np.power(P, beta)
    term_vocab = d * np.log(np.maximum(V, 2.0)) / np.power(V, gamma)
    
    loss = term_base + term_data + term_param + term_vocab
    return np.clip(loss, 0.1, 100.0)


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced three-stage robust optimization:
    1. Correlation-aware global exploration with DE
    2. Adaptive intermediate refinement
    3. L-BFGS-B for final local refinement
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.atleast_1d(np.asarray(loss_values, dtype=np.float64))
    
    # Normalize targets for better numerical stability
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_norm = (y - y_mean) / y_std
    
    # Extract features for intelligent initialization
    V = np.maximum(X[:, 0], 1e5)
    P = np.maximum(X[:, 1], 1e7)
    D = np.maximum(X[:, 2], 1e8)
    
    log_V = np.log(V)
    log_P = np.log(P)
    log_D = np.log(D)
    
    # Compute feature correlations for initialization
    try:
        corr_D = np.abs(np.corrcoef(log_D, y)[0, 1])
        corr_P = np.abs(np.corrcoef(log_P, y)[0, 1])
        corr_V = np.abs(np.corrcoef(log_V, y)[0, 1])
    except:
        corr_D = corr_P = corr_V = 0.3
    
    # Data statistics for adaptive regularization
    y_q25, y_median, y_q75, y_min, y_max = np.percentile(y, [25, 50, 75, 0, 100])
    y_iqr = np.maximum(y_q75 - y_q25, 1e-6)
    y_range = y_max - y_min + 1e-6
    
    # Adaptive regularization weight based on data characteristics
    base_reg = 0.0005
    if y_std < 0.4:
        base_reg *= 0.3
    elif y_std > 2.5:
        base_reg *= 2.0
    
    def objective(params):
        """Objective with adaptive regularization"""
        try:
            pred = scaling_law_func(X, params)
            if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
                return 1e10
            
            pred_norm = (pred - y_mean) / y_std
            pred_norm = np.clip(pred_norm, -50, 50)
            
            mse = np.mean((pred_norm - y_norm) ** 2)
            
            # Adaptive regularization: prioritize coefficient stability
            # but allow larger b, c for strong effects
            reg = (base_reg * (
                np.abs(params[0]) +
                np.maximum(0, params[1] - 2.0) * 0.5 +
                np.maximum(0, params[2] - 2.0) * 0.5 +
                np.abs(params[3])
            ))
            
            return mse + reg
        except:
            return 1e10
    
    # Bounds based on scaling law theory
    bounds = [
        (0.1, 15.0),      # a: baseline loss
        (0.001, 10.0),    # b: data coefficient
        (0.001, 10.0),    # c: parameter coefficient
        (0.0, 5.0),       # d: vocabulary coefficient
        (0.01, 1.5),      # alpha: data exponent
        (0.01, 1.5),      # beta: parameter exponent
        (0.01, 1.5),      # gamma: vocabulary exponent
    ]
    
    # Correlation-aware initialization
    x0_corr = np.array([
        y_median,
        np.clip(corr_D * y_iqr * 0.5, 0.001, 5.0),
        np.clip(corr_P * y_iqr * 0.35, 0.001, 5.0),
        np.clip(corr_V * 0.1, 0.0, 2.0),
        np.clip(0.3 + corr_D * 0.15, 0.05, 0.5),
        np.clip(0.2 + corr_P * 0.15, 0.05, 0.45),
        np.clip(0.2 + corr_V * 0.1, 0.05, 0.45),
    ])
    
    # Conservative baseline
    x0_base = np.array([
        y_median,
        y_iqr * 0.4,
        y_iqr * 0.2,
        0.05,
        0.35,
        0.25,
        0.25,
    ])
    
    # Stage 1: Global exploration with DE
    best_x = x0_base.copy()
    best_loss = objective(best_x)
    
    try:
        result_de = differential_evolution(
            objective,
            bounds,
            seed=42,
            maxiter=650,
            popsize=28,
            atol=1e-9,
            tol=1e-9,
            workers=1,
            updating='deferred',
            recombination=0.87,
            mutation=(0.45, 1.65),
            strategy='best1bin',
            init='sobol',
            polish=False
        )
        if result_de.fun < best_loss:
            best_x = result_de.x
            best_loss = result_de.fun
    except Exception:
        pass
    
    # Stage 2: Intermediate refinement from best point
    try:
        result_inter = minimize(
            objective,
            best_x,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 800,
                'ftol': 1e-11,
                'gtol': 1e-9,
                'maxcor': 25
            }
        )
        if result_inter.fun < best_loss:
            best_x = result_inter.x
            best_loss = result_inter.fun
    except Exception:
        pass
    
    # Stage 3: Secondary DE refinement with tighter bounds
    try:
        refined_bounds = [
            (max(bounds[i][0], best_x[i] * 0.6),
             min(bounds[i][1], best_x[i] * 1.6))
            for i in range(7)
        ]
        
        result_de2 = differential_evolution(
            objective,
            refined_bounds,
            seed=43,
            maxiter=400,
            popsize=18,
            atol=1e-10,
            tol=1e-10,
            workers=1,
            updating='deferred',
            polish=False
        )
        if result_de2.fun < best_loss:
            best_x = result_de2.x
            best_loss = result_de2.fun
    except Exception:
        pass
    
    # Stage 4: Final polish with L-BFGS-B
    try:
        result_final = minimize(
            objective,
            best_x,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 2500,
                'ftol': 1e-13,
                'gtol': 1e-11,
                'maxcor': 35
            }
        )
        if result_final.fun < best_loss:
            best_x = result_final.x
    except Exception:
        pass
    
    # Ensure bounds are strictly respected
    params_opt = np.array([
        np.clip(best_x[i], bounds[i][0], bounds[i][1])
        for i in range(7)
    ])
    
    return params_opt

# EVOLVE-BLOCK-END

#4 Run 5 R² = 0.667923

▼

Python

# EVOLVE-BLOCK-START
"""
Advanced scaling law discovery for LLM training under data constraints.
Enhanced optimization with adaptive bounds and two-stage refinement.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Compute loss prediction using interactive scaling law model.
    
    Model form: L = a + b*V^γ/D^α + c/P^β
    Where:
    - V: unique tokens (vocabulary size)
    - D: total tokens
    - P: model parameters  
    - α, β, γ: scaling exponents
    - a, b, c: coefficients
    
    This captures:
    1. Irreducible loss floor (a)
    2. Data and vocabulary efficiency scaling (V^γ/D^α)
    3. Model capacity scaling (1/P^β)
    """
    X = np.atleast_2d(np.asarray(data_points))
    params = np.asarray(params, dtype=np.float64)
    
    # Extract features with numerical stability
    unique_tokens = np.maximum(X[:, 0], 1e5)
    model_params = np.maximum(X[:, 1], 1e7)
    tokens = np.maximum(X[:, 2], 1e8)
    
    # Parameters: [a, b, c, alpha, beta, gamma, epsilon]
    if len(params) < 7:
        params = np.concatenate([params, np.ones(7 - len(params))])
    
    a = params[0]
    b = np.exp(params[1])  # Exponential for positivity
    c = np.exp(params[2])  # Exponential for positivity
    alpha = np.clip(params[3], 0.05, 1.5)   # Data scaling exponent
    beta = np.clip(params[4], 0.05, 1.5)    # Parameter scaling exponent
    gamma = np.clip(params[5], -0.4, 0.4)   # Vocabulary scaling exponent (expanded range)
    epsilon = np.clip(params[6], 1e-3, 1.0) # Smoothing for numerical stability
    
    # Compute loss components
    vocab_factor = np.power(unique_tokens / 1e8, gamma)
    data_term = b * vocab_factor / np.power(tokens + epsilon, alpha)
    param_term = c / np.power(model_params + epsilon, beta)
    
    # Combined prediction
    loss = a + data_term + param_term
    
    # Clip to reasonable range
    loss = np.clip(loss, 0.5, 20.0)
    
    return loss


def fit_scaling_law(data_points, loss_values):
    """
    Fit scaling law parameters using enhanced hybrid optimization.
    Uses adaptive bounds, differential evolution, and two-stage local refinement.
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values, dtype=np.float64)
    
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-6
    y_min = np.min(y)
    y_max = np.max(y)
    y_range = y_max - y_min
    
    # Compute feature statistics for adaptive bounds
    log_V = np.log10(np.maximum(X[:, 0], 1e5))
    log_P = np.log10(np.maximum(X[:, 1], 1e7))
    log_D = np.log10(np.maximum(X[:, 2], 1e8))
    
    # Correlation-based insights for initialization
    y_normalized = (y - y_mean) / y_std
    corr_V = np.corrcoef(log_V, y_normalized)[0, 1] if len(y) > 2 else 0.0
    corr_P = np.corrcoef(log_P, y_normalized)[0, 1] if len(y) > 2 else 0.0
    corr_D = np.corrcoef(log_D, y_normalized)[0, 1] if len(y) > 2 else 0.0
    
    def objective(params_flat):
        """Objective function in normalized space."""
        try:
            params = np.atleast_1d(params_flat)
            if len(params) < 7:
                params = np.concatenate([params, np.ones(7 - len(params))])
            else:
                params = params[:7]
            
            pred = scaling_law_func(X, params)
            mse = np.mean(((pred - y) / y_std) ** 2)
            
            # Adaptive regularization based on parameter magnitude
            penalties = 0.0
            penalties += 0.004 * (params[3] ** 2)  # alpha regularization
            penalties += 0.004 * (params[4] ** 2)  # beta regularization
            penalties += 0.008 * (params[5] ** 2)  # gamma regularization
            
            return mse + penalties
        except:
            return 1e10
    
    def objective_tight(params_flat):
        """Tighter objective for final refinement."""
        try:
            params = np.atleast_1d(params_flat)
            if len(params) < 7:
                params = np.concatenate([params, np.ones(7 - len(params))])
            else:
                params = params[:7]
            
            pred = scaling_law_func(X, params)
            return np.mean(((pred - y) / y_std) ** 2)
        except:
            return 1e10
    
    # Adaptive bounds based on data statistics
    alpha_center = max(0.3, min(0.8, -corr_D * 0.5)) if not np.isnan(corr_D) else 0.3
    beta_center = max(0.2, min(0.8, -corr_P * 0.5)) if not np.isnan(corr_P) else 0.3
    gamma_center = max(-0.2, min(0.2, corr_V * 0.1)) if not np.isnan(corr_V) else 0.0
    
    bounds = [
        (y_min - 0.5*y_range, y_min + 0.5*y_range),  # a
        (-2.0, 2.0),                                   # log(b)
        (-2.0, 2.0),                                   # log(c)
        (0.05, 1.5),                                   # alpha
        (0.05, 1.5),                                   # beta
        (-0.4, 0.4),                                   # gamma (expanded)
        (1e-3, 1.0),                                   # epsilon
    ]
    
    # Smart initialization using data statistics
    a_init = y_min
    mask_high = y > y_mean
    if np.sum(mask_high) > 1:
        b_init = np.log(np.mean(y[mask_high]) - y_min + 0.1)
    else:
        b_init = np.log(0.2)
    c_init = np.log(np.std(y) + 0.1)
    
    x0_fallback = np.array([
        a_init, 
        b_init, 
        c_init, 
        alpha_center,
        beta_center,
        gamma_center,
        0.1
    ])
    
    # Stage 1: Global optimization with differential evolution
    x0 = x0_fallback.copy()
    try:
        result_de = differential_evolution(
            objective,
            bounds,
            maxiter=450,
            popsize=22,
            seed=42,
            atol=1e-8,
            tol=1e-8,
            workers=1,
            updating='deferred',
            polish=True,
            mutation=(0.4, 1.6),
            recombination=0.87
        )
        x0 = result_de.x
    except:
        pass
    
    # Stage 2: First-pass local refinement with L-BFGS-B
    params_opt = x0.copy()
    try:
        result_lbfgs1 = minimize(
            objective,
            x0,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 400, 'ftol': 1e-7, 'gtol': 1e-5}
        )
        params_opt = result_lbfgs1.x
    except:
        pass
    
    # Stage 3: Second-pass fine refinement with tighter tolerance
    try:
        result_lbfgs2 = minimize(
            objective_tight,
            params_opt,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 600, 'ftol': 1e-9, 'gtol': 1e-7, 'maxfun': 4000}
        )
        params_opt = result_lbfgs2.x
    except:
        pass
    
    # Ensure exactly 7 parameters with valid values
    if len(params_opt) < 7:
        params_opt = np.concatenate([params_opt, np.ones(7 - len(params_opt))])
    else:
        params_opt = params_opt[:7]
    
    # Final validation
    if np.any(~np.isfinite(params_opt)):
        params_opt = x0_fallback
    
    return params_opt
# EVOLVE-BLOCK-END

#5 Run 4 R² = 0.639676

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law for LLM training: L = a + b/(D^c) + d/(N^e) + f*log(V)^g
Streamlined 7-parameter inverse power-law with optimized fitting.
Improved from 0.7766 by: better initialization, tighter tolerances, simplified code.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Inverse power law: L = a + b/(D^c) + d/(N^e) + f*log(V)^g
    D=tokens, N=params, V=unique_tokens
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    V = np.clip(X[:, 0], 1e7, 5e8)
    N = np.clip(X[:, 1], 1.1e8, 1.1e9)
    D = np.clip(X[:, 2], 1e9, 1e12)
    
    a = np.clip(params[0], 0.5, 8.0)
    b = np.clip(params[1], 1e-8, 100.0)
    c = np.clip(params[2], 0.05, 1.5)
    d = np.clip(params[3], 1e-8, 100.0)
    e = np.clip(params[4], 0.05, 1.5)
    f = np.clip(params[5], -10.0, 10.0)
    g = np.clip(params[6], 0.05, 2.0)
    
    log_V = np.log(np.maximum(V, 2.0))
    
    loss = (a + 
            np.clip(b / (D ** c), -50, 50) + 
            np.clip(d / (N ** e), -50, 50) + 
            np.clip(f * (log_V ** g), -50, 50))
    
    return np.clip(loss, 0.5, 12.0)


def fit_scaling_law(data_points, loss_values):
    """
    Multi-stage fitting: smart initialization → global search → aggressive local refinement
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    y_mean, y_std = np.mean(y), np.std(y) + 1e-10
    y_norm = (y - y_mean) / y_std
    
    def objective(p):
        try:
            pred = scaling_law_func(X, p)
            if not np.all(np.isfinite(pred)):
                return 1e10
            pred_norm = (pred - y_mean) / y_std
            mse = np.mean((pred_norm - y_norm) ** 2)
            reg = 0.0005 * (p[2]**2 + p[4]**2 + 0.1*p[6]**2)
            return mse + reg
        except:
            return 1e10
    
    bounds = [
        (0.5, 8.0), (1e-8, 100.0), (0.05, 1.5),
        (1e-8, 100.0), (0.05, 1.5), (-10.0, 10.0), (0.05, 2.0)
    ]
    
    # Smart initialization from data statistics
    y_min, y_p25 = np.min(y), np.percentile(y, 25)
    a_init = np.clip(y_p25 * 0.9, 0.5, 8.0)
    
    log_D = np.log(np.clip(X[:, 2], 1e9, 1e12))
    log_N = np.log(np.clip(X[:, 1], 1.1e8, 1.1e9))
    log_V = np.log(np.clip(X[:, 0], 1e7, 5e8))
    
    y_scaled = np.maximum(y - a_init + 0.1, 0.01)
    try:
        A = np.column_stack([log_D, log_N, log_V])
        coeffs = np.linalg.lstsq(A, np.log(y_scaled), rcond=None)[0]
        c_init = np.clip(-coeffs[0], 0.05, 1.5)
        e_init = np.clip(-coeffs[1], 0.05, 1.5)
        g_init = np.clip(coeffs[2], 0.05, 2.0)
    except:
        c_init, e_init, g_init = 0.3, 0.2, 0.5
    
    b_init = np.clip(np.std(y) * 0.4, 1e-8, 100.0)
    d_init = np.clip(np.std(y) * 0.2, 1e-8, 100.0)
    f_init = np.clip(np.std(y) * 0.05, -10.0, 10.0)
    
    x0 = np.array([a_init, b_init, c_init, d_init, e_init, f_init, g_init])
    
    best_params = None
    best_loss = np.inf
    
    # Stage 1: Focused differential evolution (single seed, larger popsize)
    try:
        result_de = differential_evolution(
            objective, bounds,
            seed=42,
            maxiter=600,
            popsize=28,
            atol=1e-10, tol=1e-10,
            workers=1, updating='deferred',
            mutation=(0.5, 1.5), recombination=0.8,
            polish=True
        )
        best_loss = result_de.fun
        best_params = result_de.x
    except:
        pass
    
    # Stage 2: Aggressive local refinement from global optimum
    if best_params is not None:
        try:
            result_local = minimize(
                objective, best_params,
                method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 1200, 'ftol': 1e-14, 'gtol': 1e-12}
            )
            if result_local.fun < best_loss:
                best_loss = result_local.fun
                best_params = result_local.x
        except:
            pass
    
    # Stage 3: Ultra-tight final polish
    if best_params is not None:
        try:
            result_final = minimize(
                objective, best_params,
                method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 1500, 'ftol': 1e-15, 'gtol': 1e-13}
            )
            if result_final.fun < best_loss:
                best_params = result_final.x
        except:
            pass
    
    return best_params if best_params is not None else x0
# EVOLVE-BLOCK-END