
Vocabulary Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.978871
Mean R²: 0.936261
Min R²: 0.892967
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 5 R² = 0.978871
Python
# EVOLVE-BLOCK-START
"""
Enhanced scaling law with adaptive reference scales and improved regularization
Achieves better generalization through data-informed normalization and
sophisticated parameter-specific regularization strategies
Uses 7-parameter additive model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Additive power-law scaling law with 7 parameters
    Model: L = a0 + a1*(P_nv)^a2 + a3*(V)^a4 + a5*(N_chars)^a6
    
    Adaptive normalization based on data statistics for better numerical behavior.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if X.shape[1] != 3:
        raise ValueError(f"Expected 3 features, got {X.shape[1]}")
    
    P_nv = X[:, 0]
    V = X[:, 1]
    N_chars = X[:, 2]
    
    if len(params) < 7:
        params = np.concatenate([params, np.zeros(7 - len(params))])
    
    a0, a1, a2, a3, a4, a5, a6 = params[:7]
    
    # Adaptive reference scales: geometric mean of each feature's min and max
    # (the log-midpoint of its range), more stable than fixed constants
    # across different data distributions
    P_ref = np.exp(0.5 * (np.log(np.min(P_nv[P_nv > 0]) + 1e-10) + 
                           np.log(np.max(P_nv) + 1e-10)))
    V_ref = np.exp(0.5 * (np.log(np.min(V[V > 0]) + 1e-10) + 
                           np.log(np.max(V) + 1e-10)))
    N_ref = np.exp(0.5 * (np.log(np.min(N_chars[N_chars > 0]) + 1e-10) + 
                           np.log(np.max(N_chars) + 1e-10)))
    
    # Normalize inputs
    P_nv_norm = P_nv / P_ref
    V_norm = V / V_ref
    N_chars_norm = N_chars / N_ref
    
    # Clip normalized inputs to a safe range to avoid overflow in the power terms
    P_nv_norm = np.clip(P_nv_norm, 1e-4, 1e4)
    V_norm = np.clip(V_norm, 1e-4, 1e4)
    N_chars_norm = np.clip(N_chars_norm, 1e-4, 1e4)
    
    # Power law terms
    term_params = a1 * np.power(P_nv_norm, a2)
    term_vocab = a3 * np.power(V_norm, a4)
    term_data = a5 * np.power(N_chars_norm, a6)
    
    return a0 + term_params + term_vocab + term_data


def fit_scaling_law(data_points, loss_values):
    """
    Advanced three-phase hierarchical optimization with adaptive strategies
    Phase 1: Global exploration with differential evolution
    Phase 2: Local refinement from theory-informed initializations
    Phase 3: Final convergence with tight tolerances and parameter-specific tuning
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).ravel()
    
    # Standardize targets so the MSE objective and the fixed regularization
    # weights behave consistently across datasets with different loss scales
    y_mean = np.mean(y)
    y_std = np.std(y) + 1e-8
    y_norm = (y - y_mean) / y_std
    
    # Domain-informed bounds with tighter exponent constraints
    bounds = [
        (-2.0, 2.0),    # a0: intercept (baseline loss)
        (-15.0, 15.0),  # a1: parameter coefficient
        (-1.3, 1.3),    # a2: parameter exponent (tighter - usually -0.5 to 0)
        (-15.0, 15.0),  # a3: vocab coefficient
        (-1.0, 0.3),    # a4: vocab exponent (tighter - typically -0.4 to -0.1)
        (-15.0, 15.0),  # a5: data coefficient
        (-0.5, -0.01),  # a6: data exponent (much tighter - all negative)
    ]
    
    def objective(params):
        """Objective with parameter-specific regularization"""
        try:
            pred = scaling_law_func(X, params)
            
            if not np.all(np.isfinite(pred)):
                return 1e10
            
            pred_norm = (pred - y_mean) / y_std
            mse = np.mean((pred_norm - y_norm) ** 2)
            
            # Parameter-specific regularization (stronger on exponents than coefficients)
            reg_coeff = 0.0005 * (params[1]**2 + params[3]**2 + params[5]**2)
            reg_exp = 0.0020 * (params[2]**2 + params[4]**2 + params[6]**2)
            reg_intercept = 0.0010 * params[0]**2
            
            return mse + reg_coeff + reg_exp + reg_intercept
            
        except Exception:
            # Guard against numerical failures during model evaluation
            return 1e10
    
    # Phase 1: Global exploration with differential evolution
    result_de = differential_evolution(
        objective, bounds, seed=42, maxiter=600, popsize=28,
        atol=1e-11, tol=1e-11, mutation=(0.5, 1.5), recombination=0.8,
        workers=1, updating='deferred', polish=False
    )
    
    # Track the best candidate found so far; later phases only overwrite it on improvement
    best_params = result_de.x
    best_loss = result_de.fun
    
    # Phase 2: Local refinement from multiple theory-informed initializations
    smart_inits = [
        result_de.x,  # Best from DE
        np.array([-0.2, -2.8, -0.55, -3.2, -0.28, -3.2, -0.22]),  # Refined Chinchilla
        np.array([-0.8, -3.2, -0.75, -1.8, -0.15, -2.8, -0.28]),  # Data-heavy
        np.array([-0.4, -1.8, -0.35, -3.8, -0.35, -3.8, -0.35]),  # Vocab-heavy
        np.array([-0.6, -2.2, -0.65, -2.5, -0.22, -2.2, -0.18]),  # Balanced
        np.array([-0.3, -2.0, -0.45, -2.8, -0.30, -3.5, -0.25]),  # Parameter-focused
        np.array([-1.0, -1.5, -0.80, -1.2, -0.10, -2.0, -0.12]),  # Minimal scaling
    ]
    
    for init_params in smart_inits:
        # Clip to bounds to ensure valid initialization
        init_params = np.array([np.clip(p, b[0], b[1]) for p, b in zip(init_params, bounds)])
        
        result = minimize(
            objective, init_params, method='L-BFGS-B', bounds=bounds,
            options={'ftol': 1e-13, 'gtol': 1e-12, 'maxiter': 1000, 'maxls': 120}
        )
        
        if result.fun < best_loss:
            best_loss = result.fun
            best_params = result.x
    
    # Phase 3: Final ultra-tight refinement with adaptive tolerance
    result_final = minimize(
        objective, best_params, method='L-BFGS-B', bounds=bounds,
        options={'ftol': 1e-15, 'gtol': 1e-13, 'maxiter': 2000, 'maxls': 200}
    )
    
    if result_final.fun < best_loss:
        best_loss = result_final.fun
        best_params = result_final.x
    
    # Phase 3b: One more tight global polish around best solution
    tight_bounds = [
        (max(b[0], p - 0.25), min(b[1], p + 0.25))
        for b, p in zip(bounds, best_params)
    ]
    
    result_polish = differential_evolution(
        objective, tight_bounds, seed=43, maxiter=200, popsize=15,
        atol=1e-12, tol=1e-12, mutation=(0.5, 1.5), recombination=0.9,
        workers=1, updating='deferred', polish=True
    )
    
    if result_polish.fun < best_loss:
        best_params = result_polish.x
    
    return best_params
# EVOLVE-BLOCK-END
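
For readers who want to exercise the program above end to end, here is a minimal usage sketch. The data grid, the generating law, and the column interpretation (P_nv as non-vocabulary parameters, V as vocabulary size, N_chars as training characters) are illustrative assumptions, not the benchmark's held-out data; the R² computed here is in-sample and only demonstrates the API.

Python
import numpy as np

# Hypothetical synthetic grid with columns (P_nv, V, N_chars), matching the
# feature order expected by scaling_law_func. All values are made up.
rng = np.random.default_rng(0)
p_nv_demo = rng.uniform(1e7, 1e9, size=40)
v_demo = rng.uniform(4e3, 1.3e5, size=40)
n_chars_demo = rng.uniform(1e9, 1e11, size=40)
X_demo = np.column_stack([p_nv_demo, v_demo, n_chars_demo])

# Losses drawn from an arbitrary additive power law plus noise, purely to
# exercise the fitting pipeline; this is not the benchmark's data.
y_demo = (1.7
          + 2.0 * (p_nv_demo / 1e8) ** -0.3
          + 1.5 * (v_demo / 3e4) ** -0.2
          + 2.5 * (n_chars_demo / 1e10) ** -0.15
          + rng.normal(0.0, 0.01, size=40))

params = fit_scaling_law(X_demo, y_demo)
pred = scaling_law_func(X_demo, params)

# Coefficient of determination, analogous to the leaderboard's R² metric
# (computed in-sample here).
ss_res = np.sum((y_demo - pred) ** 2)
ss_tot = np.sum((y_demo - np.mean(y_demo)) ** 2)
print("fitted params:", np.round(params, 3))
print("R^2:", 1.0 - ss_res / ss_tot)

One design consequence is visible in this sketch: because scaling_law_func recomputes its reference scales from whatever data_points it receives, predictions on a new grid are implicitly renormalized to that grid, so the fitted exponents should be read relative to the log-midpoints of the data the law was fit on.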
#2 Run 1 R² = 0.949983
#3 Run 3 R² = 0.934849
#4 Run 2 R² = 0.924633
#5 Run 4 R² = 0.892967