
Domain Mixture Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.998086
Mean R²: 0.977918
Min R²: 0.910012
Runs: 5

All Runs (sorted by R²)

Best Run 1 R² = 0.998086
Python
# EVOLVE-BLOCK-START
"""
Enhanced scaling law for multi-domain LLM finetuning.
Optimized with smart initialization, adaptive regularization, and a refined two-stage optimization strategy.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Multi-domain loss prediction using power law basis
    
    Model: loss[d] = sum_f(c[d,f] * x[f]^a[f]) + b[d]
    
    params layout (35 total):
    [0-4]: shared exponents for input features
    [5-9]: domain-specific bias terms
    [10-34]: coefficients (5×5 matrix, one row per output domain)
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64).ravel()
    
    # Parse parameters with numerical stability
    shared_exp = np.clip(params[:5], -2.0, 2.0)
    domain_bias = params[5:10]
    coeffs = params[10:35].reshape(5, 5)
    
    # Safe input preparation for power operations
    X_safe = np.clip(X, 1e-8, 1.0)
    
    # Vectorized computation
    X_powered = X_safe ** shared_exp[None, :]  # (N, 5)
    pred = X_powered @ coeffs.T + domain_bias[None, :]  # (N, 5)
    
    # Clip to valid loss range
    return np.clip(pred, 1.0, 5.0)


def fit_scaling_law(data_points, loss_values):
    """
    Fit scaling law with advanced initialization and adaptive regularization
    
    Key improvements:
    1. Per-domain least-squares coefficient initialization
    2. Adaptive exponent initialization based on domain variance
    3. Centered/scaled features for better initialization numerics
    4. Adaptive regularization scaled by data statistics
    5. Two-stage optimization with progressive refinement
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64)
    
    # Ensure y has shape (N, D): promote a 1-D loss vector to a single column
    # (np.atleast_2d would instead give shape (1, N))
    if y.ndim == 1:
        y = y[:, None]
    
    N, F = X.shape
    D = y.shape[1]
    
    # Compute data statistics for adaptive initialization
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    
    # Initialize parameters
    init_params = np.zeros(35)
    
    # Adaptive exponent initialization based on domain variance
    y_var_normalized = y_std / (np.max(y_std) + 1e-8)
    init_params[0:5] = 0.3 + 0.3 * y_var_normalized  # Range [0.3, 0.6]
    
    # Initialize biases from per-domain means
    init_params[5:10] = y_mean[:5]
    
    # Improved coefficient initialization using centered/scaled features
    X_safe = np.clip(X, 1e-8, 1.0)
    X_centered = X_safe - np.mean(X_safe, axis=0, keepdims=True)
    X_std = np.std(X_safe, axis=0, keepdims=True) + 1e-8
    X_scaled = X_centered / X_std
    
    # Fit coefficients per domain using scaled features
    for d in range(D):
        try:
            y_d = y[:, d] - np.mean(y[:, d])
            # Solve least-squares with scaled features for better numerics
            c_d = np.linalg.lstsq(X_scaled, y_d, rcond=None)[0]
            # Normalize to prevent extreme initialization
            c_d_norm = np.linalg.norm(c_d) + 1e-8
            c_d_normalized = c_d / c_d_norm
            init_params[10 + d*5:10 + (d+1)*5] = np.clip(c_d_normalized, -1.0, 1.0)
        except Exception:
            # Fall back to small uniform coefficients if the least-squares solve fails
            init_params[10 + d*5:10 + (d+1)*5] = 0.02
    
    # Objective function with adaptive regularization
    def objective(flat_params):
        try:
            pred = scaling_law_func(X, flat_params)
            
            if pred.shape != y.shape:
                return 1e10
            
            # Main MSE loss
            mse = np.mean((pred - y) ** 2)
            
            # Adaptive coefficient regularization
            coeffs = flat_params[10:35]
            coeff_reg = 0.0006 * np.mean(coeffs ** 2)
            
            # Exponent regularization: keep close to initialized adaptive values
            exp_deviation = flat_params[0:5] - init_params[0:5]
            exp_reg = 0.00008 * np.sum(exp_deviation ** 2)
            
            # Bias regularization: keep biases anchored to data mean
            bias_deviation = flat_params[5:10] - y_mean[:5]
            bias_reg = 0.00005 * np.sum(bias_deviation ** 2)
            
            return mse + coeff_reg + exp_reg + bias_reg
        except Exception:
            # Penalize any parameter vector that triggers a numerical failure
            return 1e10
    
    # Adaptive bounds based on data statistics
    loss_min, loss_max = np.min(y), np.max(y)
    
    bounds = [
        *[(-2.0, 2.0)] * 5,                          # exponents
        *[(loss_min - 0.3, loss_max + 0.3)] * 5,     # biases
        *[(-1.5, 1.5)] * 25                           # coefficients
    ]
    
    # Primary optimization with balanced settings
    result = minimize(
        objective,
        init_params,
        method='L-BFGS-B',
        bounds=bounds,
        options={
            'maxiter': 1500,
            'ftol': 1e-9,
            'gtol': 1e-8,
            'maxcor': 15
        }
    )
    
    # Secondary fine-tuning for improved convergence
    if result.success:
        result2 = minimize(
            objective,
            result.x,
            method='L-BFGS-B',
            bounds=bounds,
            options={
                'maxiter': 500,
                'ftol': 1e-10,
                'gtol': 1e-9,
                'maxcor': 20
            }
        )
        # Use better result
        if result2.fun < result.fun:
            return result2.x.ravel()
        return result.x.ravel()
    else:
        return init_params.ravel()
# EVOLVE-BLOCK-END
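
Below is a minimal usage sketch (not part of the submitted run), assuming the benchmark supplies data_points as an (N, 5) array of domain-mixture features in (0, 1] and loss_values as an (N, 5) array of per-domain losses, as the parameter layout above implies. The synthetic data and the true_params values are illustrative only.

# Minimal usage sketch: synthetic data, fit, and an R^2 sanity check.
# All names below except scaling_law_func / fit_scaling_law are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.uniform(0.05, 1.0, size=(40, 5))       # 40 synthetic mixture points, 5 features

# Build a plausible ground-truth parameter vector in the layout documented above
true_params = np.concatenate([
    np.full(5, 0.5),                                 # shared exponents a[f]
    np.full(5, 2.0),                                 # domain biases b[d]
    rng.uniform(0.05, 0.4, size=25),                 # coefficients c[d, f]
])
y_demo = scaling_law_func(X_demo, true_params)       # noiseless synthetic per-domain losses

fitted = fit_scaling_law(X_demo, y_demo)
pred = scaling_law_func(X_demo, fitted)

ss_res = np.sum((y_demo - pred) ** 2)
ss_tot = np.sum((y_demo - y_demo.mean(axis=0)) ** 2)
print("synthetic-fit R^2:", 1.0 - ss_res / ss_tot)
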
#2 Run 3 R² = 0.997906
#3 Run 5 R² = 0.993756
#4 Run 4 R² = 0.989829
#5 Run 2 R² = 0.910012