Vocabulary Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.986132
Mean R²: 0.984428
Min R²: 0.980252
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.986132
Python
# EVOLVE-BLOCK-START
"""
Optimized scaling law with multiplicative vocabulary modulation
Key improvements:
1. Multiplicative vocab modulation: (1 + b*log(V)) directly scales base efficiency
2. Dual interaction terms: both P-based and D-based for comprehensive modeling
3. Enhanced optimization with basin-hopping and adaptive bounds
4. Tighter regularization targeting Chinchilla-optimal values
Uses exactly 7 parameters
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution, basinhopping

def scaling_law_func(data_points, params):
    """
    Optimized scaling law with multiplicative vocabulary effects:
    L = a * P^(-alpha) * D^(-beta) * (1 + b*log(V)) + c*log(V)/(P^gamma * D^delta) + offset
    
    Wait, that's 8 parameters. Let me simplify to 7:
    L = a * P^(-alpha) * D^(-beta) * (1 + b*log(V)) + c*log(V)/D^gamma + offset
    
    This captures:
    - Base Chinchilla power law: a * P^(-alpha) * D^(-beta)
    - Multiplicative vocab efficiency: (1 + b*log(V))
    - Data-vocab interaction: c*log(V)/D^gamma (vocab helps with more data)
    - Baseline offset
    
    7 parameters: [a, alpha, beta, b, c, gamma, offset]
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64).ravel()
    
    # Ensure exactly 7 parameters
    if len(params) < 7:
        params = np.pad(params, (0, 7 - len(params)), constant_values=0.0)
    params = params[:7]
    
    # Extract features with numerical stability
    eps = 1e-10
    P = np.maximum(X[:, 0], eps)  # non_vocab_parameters
    V = np.maximum(X[:, 1], eps)  # vocab_size
    D = np.maximum(X[:, 2], eps)  # num_characters
    
    # Extract parameters
    a, alpha, beta, b, c, gamma, offset = params
    
    # Force positive exponents for numerical stability
    alpha = np.abs(alpha)
    beta = np.abs(beta)
    gamma = np.abs(gamma)
    
    # Compute log vocabulary once
    log_V = np.log(V)
    
    # Term 1: Base power law with multiplicative vocabulary modulation
    base_scaling = a * np.power(P, -alpha) * np.power(D, -beta)
    vocab_multiplier = 1.0 + b * log_V
    term1 = base_scaling * vocab_multiplier
    
    # Term 2: Data-vocabulary interaction
    # Captures how vocabulary efficiency depends on data availability
    term2 = c * log_V * np.power(D, -gamma)
    
    # Final prediction
    pred = term1 + term2 + offset
    
    return pred
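

# Hedged usage sketch (an editorial illustration, not part of the evolved law):
# shows the calling convention of scaling_law_func on a single hypothetical
# configuration. All values below are assumptions chosen for demonstration,
# not fitted results.
def _example_scaling_law_call():
    # One data point: [P (non-vocab parameters), V (vocab size), D (num characters)]
    demo_point = [[1.0e8, 32768.0, 1.0e10]]
    # Hypothetical parameters [a, alpha, beta, b, c, gamma, offset]
    demo_params = [8.0, 0.34, 0.28, 0.02, 0.5, 0.25, 1.7]
    # Returns a length-1 array containing the predicted loss
    return scaling_law_func(demo_point, demo_params)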


def fit_scaling_law(data_points, loss_values):
    """
    Four-stage robust optimization:
    1. Differential evolution with wide exploration
    2. L-BFGS-B refinement
    3. Basin-hopping to escape local minima
    4. Final TNC polish
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    # Compute statistics
    y_mean = np.mean(y)
    y_std = np.std(y)
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y
            mse = np.mean(residuals ** 2)
            
            # Light regularization: penalize deviation from Chinchilla-optimal
            # exponents (alpha ~ 0.34, beta ~ 0.28 in the Chinchilla paper)
            chinchilla_penalty = 1e-7 * (
                (params[1] - 0.34)**2 + 
                (params[2] - 0.28)**2 + 
                params[5]**2  # Keep gamma small
            )
            
            return mse + chinchilla_penalty
        except Exception:
            # Penalize any parameter set that produces a numerical error
            return 1e10
    
    # Optimized bounds based on top performers
    bounds = [
        (0.001, 100.0),    # a: scale coefficient
        (0.01, 2.0),       # alpha: param exponent
        (0.01, 2.0),       # beta: data exponent
        (-1.0, 1.0),       # b: vocab multiplier
        (-10.0, 10.0),     # c: interaction coefficient
        (0.01, 2.0),       # gamma: interaction exponent
        (y_mean - 4*y_std, y_mean + 2*y_std)  # offset
    ]
    
    # Stage 1: Global search with differential evolution
    result_de = differential_evolution(
        objective,
        bounds,
        maxiter=500,
        popsize=25,
        seed=42,
        atol=1e-9,
        tol=1e-9,
        workers=1,
        strategy='best1bin',
        mutation=(0.5, 1.8),
        recombination=0.8,
        polish=False
    )
    
    best_params = result_de.x
    best_loss = result_de.fun
    
    # Stage 2: L-BFGS-B refinement
    result_lbfgs = minimize(
        objective,
        best_params,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 1500,
            'ftol': 1e-12,
            'gtol': 1e-10
        }
    )
    
    if result_lbfgs.success and result_lbfgs.fun < best_loss:
        best_params = result_lbfgs.x
        best_loss = result_lbfgs.fun
    
    # Stage 3: Basin-hopping to escape local minima
    class BoundsChecker:
        def __init__(self, bounds):
            self.bounds = bounds
        
        def __call__(self, **kwargs):
            x = kwargs["x_new"]
            tmax = bool(np.all(x <= [b[1] for b in self.bounds]))
            tmin = bool(np.all(x >= [b[0] for b in self.bounds]))
            return tmax and tmin
    
    minimizer_kwargs = {
        "method": "L-BFGS-B",
        "bounds": bounds,
        "options": {"maxiter": 500, "ftol": 1e-11}
    }
    
    try:
        result_bh = basinhopping(
            objective,
            best_params,
            minimizer_kwargs=minimizer_kwargs,
            niter=30,
            T=1.0,
            stepsize=0.5,
            accept_test=BoundsChecker(bounds),
            seed=42
        )
        
        if result_bh.fun < best_loss:
            best_params = result_bh.x
            best_loss = result_bh.fun
    except Exception:
        # Basin-hopping is optional; keep the best result found so far
        pass
    
    # Stage 4: Final TNC polish
    try:
        result_tnc = minimize(
            objective,
            best_params,
            method='TNC',
            bounds=bounds,
            options={'maxiter': 800, 'ftol': 1e-12}
        )
        
        if result_tnc.success and result_tnc.fun < best_loss:
            best_params = result_tnc.x
    except Exception:
        # TNC polish is optional; fall back to the best parameters found so far
        pass
    
    return best_params
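

# Minimal end-to-end sketch (an editorial illustration, not part of the evolved
# submission): fits the law on synthetic data generated from hypothetical
# "true" parameters and reports the coefficient of determination R^2.
# Guarded so it never runs on import.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    P = rng.uniform(1e7, 1e9, n)    # non-vocab parameters
    V = rng.uniform(4e3, 1e5, n)    # vocabulary size
    D = rng.uniform(1e9, 1e11, n)   # number of characters
    X_demo = np.column_stack([P, V, D])

    # Hypothetical ground-truth parameters [a, alpha, beta, b, c, gamma, offset]
    true_params = np.array([8.0, 0.34, 0.28, 0.02, 0.5, 0.25, 1.7])
    y_demo = scaling_law_func(X_demo, true_params)
    y_demo = y_demo + rng.normal(0.0, 0.005, n)  # small observation noise

    fitted = fit_scaling_law(X_demo, y_demo)
    pred = scaling_law_func(X_demo, fitted)
    ss_res = np.sum((y_demo - pred) ** 2)
    ss_tot = np.sum((y_demo - np.mean(y_demo)) ** 2)
    print("R^2 on synthetic data:", 1.0 - ss_res / ss_tot)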
# EVOLVE-BLOCK-END
#2 Run 4 R² = 0.985661
#3 Run 2 R² = 0.985194
#4 Run 5 R² = 0.984899
#5 Run 3 R² = 0.980252