SLD - Domain Mixture Scaling Law - SLDAgent + Claude Sonnet 4.5

All Runs (sorted by R²)

Best Run 4 R² = 0.998727

▼

Python

# EVOLVE-BLOCK-START
"""
Refined per-domain scaling law with robust cross-domain modeling
Uses 35 parameters: 7 per domain with enhanced numerical stability and fitting
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Per-domain scaling law with comprehensive modeling.
    Each of 5 domains uses 7 parameters (35 total):
    - scale: power law coefficient
    - exponent: power law shape (clipped for stability)
    - bias: baseline loss offset
    - quad: quadratic self-interaction
    - cross1, cross2, cross3: top 3 cross-domain linear effects
    
    Model: loss_d = scale * prop_d^exp + quad * prop_d^2 + 
                    sum(cross_i * prop_other_i) + bias
    """
    X = np.atleast_2d(np.asarray(data_points))  # (N, 5)
    N, F = X.shape
    params = np.asarray(params).flatten()
    
    # Ensure exactly 35 parameters
    if len(params) < 35:
        params = np.pad(params, (0, 35 - len(params)), constant_values=0.0)
    params = params[:35]
    
    # Numerical stability with safe clipping
    X_safe = np.clip(X, 1e-9, 1.0)
    
    predictions = np.zeros((N, F))
    
    for d in range(F):
        idx = d * 7
        
        # Extract domain parameters
        scale = params[idx]
        exponent = params[idx + 1]
        bias = params[idx + 2]
        quad = params[idx + 3]
        cross_weights = params[idx + 4:idx + 7]
        
        # Clip exponent for stability
        exponent = np.clip(exponent, 0.1, 2.3)
        
        # Power law: main self-effect
        power_term = scale * (X_safe[:, d] ** exponent)
        
        # Quadratic: concentration effects
        quad_term = quad * (X_safe[:, d] ** 2)
        
        # Cross-domain: systematic other domain selection
        cross_term = 0.0
        other_domains = [i for i in range(F) if i != d]
        for w_idx in range(min(3, len(other_domains))):
            cross_term += cross_weights[w_idx] * X_safe[:, other_domains[w_idx]]
        
        # Combine terms
        predictions[:, d] = power_term + quad_term + cross_term + bias
    
    return predictions


def fit_scaling_law(data_points, loss_values):
    """
    Robust fitting with enhanced initialization and progressive optimization
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.atleast_2d(np.asarray(loss_values))
    
    N, F = X.shape
    
    # Shape correction
    if y.shape[1] != F:
        if y.shape[0] == F and y.shape[1] == N:
            y = y.T
        elif y.shape[1] == 1:
            y = np.tile(y, (1, F))
    
    # Enhanced data-driven initialization
    init = np.zeros(35)
    
    for d in range(F):
        idx = d * 7
        y_d = y[:, d]
        x_d = X[:, d]
        
        # Robust statistics using percentiles
        q10, q25, q50, q75, q90 = np.percentile(y_d, [10, 25, 50, 75, 90])
        iqr = q75 - q25
        
        # Estimate exponent via log-space regression on filtered data
        mask = (x_d > 0.01) & (y_d > q10)
        if np.sum(mask) > 3:
            log_x = np.log(x_d[mask] + 1e-7)
            log_y = np.log(np.maximum(y_d[mask] - q10 + 0.05, 0.05))
            
            if np.std(log_x) > 1e-5:
                # Robust slope estimation
                slope = np.polyfit(log_x, log_y, 1)[0]
                init[idx + 1] = np.clip(slope, 0.4, 1.1)
            else:
                init[idx + 1] = 0.65
        else:
            init[idx + 1] = 0.65
        
        # Scale: based on IQR
        init[idx] = iqr * 0.5
        
        # Bias: lower percentile baseline with median contribution
        init[idx + 2] = q10 * 0.8 + q50 * 0.2
        
        # Quadratic: small stabilizing term
        init[idx + 3] = iqr * 0.02
        
        # Cross-domain: proportional to residual with variation
        init[idx + 4:idx + 7] = iqr * 0.015 * (1.0 + 0.08 * np.random.randn(3))
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            
            # Adaptive domain weighting: inverse variance
            domain_vars = np.var(y, axis=0)
            weights = 1.0 / (domain_vars + 0.03)
            weights = F * weights / np.sum(weights)
            
            mse = np.mean(((pred - y) ** 2) * weights[None, :])
            
            # Hierarchical regularization
            # Lightest on main effects
            r_scale = 0.00002 * np.sum(params[::7] ** 2)
            r_bias = 0.00001 * np.sum(params[2::7] ** 2)
            
            # Moderate on shape parameters
            r_exp = 0.00007 * np.sum((params[1::7] - 0.7) ** 2)
            r_quad = 0.00009 * np.sum(params[3::7] ** 2)
            
            # Stronger on cross-domain to prevent overfitting
            r_cross = 0.00028 * (np.sum(params[4::7] ** 2) + 
                                  np.sum(params[5::7] ** 2) + 
                                  np.sum(params[6::7] ** 2))
            
            return mse + r_scale + r_bias + r_exp + r_quad + r_cross
        except:
            return 1e10
    
    # Carefully tuned bounds
    bounds = []
    for _ in range(F):
        bounds.extend([
            (-9, 13),      # scale
            (0.1, 2.3),    # exponent
            (-7, 11),      # bias
            (-3, 3),       # quad
            (-4, 4),       # cross1
            (-4, 4),       # cross2
            (-4, 4)        # cross3
        ])
    
    # Progressive multi-stage optimization
    best_params = init
    best_loss = objective(init)
    
    # Stage 1: Exploration with moderate tolerance
    res1 = minimize(objective, init, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 1400, 'ftol': 1e-9, 'gtol': 1e-8})
    
    if res1.success and res1.fun < best_loss:
        best_params = res1.x
        best_loss = res1.fun
    
    # Stage 2: Refinement with tighter tolerance
    res2 = minimize(objective, best_params, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 1200, 'ftol': 1e-11, 'gtol': 1e-9})
    
    if res2.success and res2.fun < best_loss:
        best_params = res2.x
        best_loss = res2.fun
    
    # Stage 3: Final polishing
    res3 = minimize(objective, best_params, method='L-BFGS-B', bounds=bounds,
                    options={'maxiter': 600, 'ftol': 1e-12})
    
    if res3.success and res3.fun < best_loss:
        best_params = res3.x
    
    return best_params
# EVOLVE-BLOCK-END

#2 Run 3 R² = 0.998636

▼

Python

# EVOLVE-BLOCK-START
"""
Advanced scaling law with domain interactions and adaptive power laws
Uses exactly 35 parameters for enhanced multi-domain modeling
- 5 output domains × 7 params each
- Per domain: 1 base + 1 self-coeff + 1 self-exp + 4 cross-coeffs
Key innovation: Separate exponents per domain, better handling of cross-domain effects
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Enhanced per-domain model with domain-specific exponents
    
    For each of 5 output domains (7 params each):
    - params[0]: base loss
    - params[1]: self-domain coefficient
    - params[2]: self-domain exponent
    - params[3:7]: cross-domain coefficients (4 others)
    
    Innovation: Each domain has its own power law behavior
    """
    X = np.atleast_2d(np.asarray(data_points))
    N = X.shape[0]
    params = np.asarray(params).flatten()
    
    if len(params) < 35:
        params = np.pad(params, (0, 35 - len(params)), constant_values=1.0)
    params = params[:35].reshape(5, 7)
    
    predictions = np.zeros((N, 5))
    
    for d_out in range(5):
        p = params[d_out]
        base = p[0]
        self_coeff = p[1]
        self_exp = np.clip(p[2], 0.2, 3.5)  # Wider range for flexibility
        
        # Start with base
        pred = base
        
        # Self-domain: power law with domain-specific exponent
        x_self = np.maximum(X[:, d_out], 1e-10)
        pred += self_coeff * np.power(x_self, self_exp)
        
        # Cross-domain effects with quadratic interaction term
        cross_idx = 0
        for d_in in range(5):
            if d_in != d_out:
                cross_coeff = p[3 + cross_idx]
                x_cross = X[:, d_in]
                
                # Linear + small quadratic interaction with self-domain
                pred += cross_coeff * x_cross
                
                # Add subtle interaction between cross-domain and self-domain
                # This helps capture non-linear transfer effects
                interaction = 0.1 * cross_coeff * x_cross * x_self
                pred += interaction
                
                cross_idx += 1
        
        predictions[:, d_out] = pred
    
    return predictions


def fit_scaling_law(data_points, loss_values):
    """
    Hybrid optimization: Global search + local refinement
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.atleast_2d(np.asarray(loss_values))
    
    if y.shape[1] == 1:
        y = np.repeat(y, 5, axis=1)
    
    P = 35
    
    # Enhanced statistics
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    y_min = np.min(y, axis=0)
    y_max = np.max(y, axis=0)
    y_range = y_max - y_min
    
    # Compute domain-specific correlations
    correlations = np.zeros((5, 5))
    for i in range(5):
        for j in range(5):
            if np.std(X[:, i]) > 1e-8 and np.std(y[:, j]) > 1e-8:
                correlations[i, j] = np.corrcoef(X[:, i], y[:, j])[0, 1]
    
    # Advanced initialization
    init = np.zeros(P)
    for d in range(5):
        idx = d * 7
        
        # Base: start lower to let other terms contribute
        init[idx] = y_min[d] + 0.2 * y_range[d]
        
        # Self-domain coefficient: strong effect
        init[idx + 1] = y_std[d] * 0.85
        
        # Self-domain exponent: domain-specific initialization
        # Domains with higher variance might benefit from different exponents
        variance_ratio = y_std[d] / (np.mean(y_std) + 1e-8)
        init[idx + 2] = 0.85 + 0.3 * variance_ratio  # Range ~0.85-1.15
        
        # Cross-domain coefficients: correlation-weighted
        cross_idx = 0
        for d_in in range(5):
            if d_in != d:
                corr_strength = np.abs(correlations[d_in, d])
                # Scale by correlation and relative domain importance
                init[idx + 3 + cross_idx] = y_std[d] * 0.12 * (1.0 + 1.5 * corr_strength)
                cross_idx += 1
    
    # Adaptive bounds per domain
    bounds = []
    for d in range(5):
        bounds.append((y_min[d] - 0.5, y_max[d] + 0.5))  # base
        bounds.append((-30.0, 30.0))  # self-coeff (wider for flexibility)
        bounds.append((0.2, 3.5))  # self-exp (wider range)
        bounds.extend([(-25.0, 25.0)] * 4)  # cross-coeffs
    
    def objective(p):
        try:
            pred = scaling_law_func(X, p)
            
            # MSE with per-domain weighting (focus on harder domains)
            errors = (pred - y) ** 2
            domain_weights = 1.0 + 0.5 * (y_std / (np.mean(y_std) + 1e-8))
            mse = np.mean(errors * domain_weights)
            
            P_mat = p[:35].reshape(5, 7)
            
            # Adaptive regularization
            self_coeff_scale = np.mean(np.abs(P_mat[:, 1]))
            cross_coeff_scale = np.mean(np.abs(P_mat[:, 3:7]))
            
            # L2 on self-domain coefficients
            reg_self = 2e-5 * np.sum(P_mat[:, 1] ** 2) * (1 + self_coeff_scale / 12)
            
            # L2 on cross-domain coefficients
            reg_cross = 4e-5 * np.sum(P_mat[:, 3:7] ** 2) * (1 + cross_coeff_scale / 10)
            
            # Exponent regularization: prefer near 1.0 but allow variation
            reg_exp = 2e-4 * np.sum((P_mat[:, 2] - 1.0) ** 2)
            
            # Base regularization
            reg_base = 5e-6 * np.sum((P_mat[:, 0] - y_mean) ** 2)
            
            return mse + reg_self + reg_cross + reg_exp + reg_base
        except:
            return 1e10
    
    best_params, best_score = None, float('inf')
    
    # Stage 1: Differential evolution for global exploration (quick pass)
    try:
        result = differential_evolution(
            objective,
            bounds,
            maxiter=150,
            popsize=8,
            seed=42,
            atol=1e-10,
            tol=1e-10
        )
        if result.fun < best_score:
            best_score, best_params = result.fun, result.x
    except:
        pass
    
    # Stage 2: L-BFGS-B from smart initialization
    try:
        res = minimize(
            objective, init, method='L-BFGS-B', bounds=bounds,
            options={'maxiter': 4000, 'ftol': 1e-13, 'gtol': 1e-9}
        )
        if res.fun < best_score:
            best_score, best_params = res.fun, res.x
    except:
        pass
    
    # Stage 3: L-BFGS-B from DE result
    if best_params is not None:
        try:
            res = minimize(
                objective, best_params, method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 3000, 'ftol': 1e-13, 'gtol': 1e-9}
            )
            if res.fun < best_score:
                best_score, best_params = res.fun, res.x
        except:
            pass
    
    # Stage 4: Multiple restarts with strategic perturbations
    for attempt in range(3):
        try:
            if best_params is not None:
                # Perturb around best solution
                noise_scale = 0.05 * (attempt + 1)
                init_p = best_params + np.random.randn(P) * noise_scale
            else:
                # Perturb around initialization
                noise_scale = 0.08 * (attempt + 1)
                init_p = init + np.random.randn(P) * noise_scale
            
            res = minimize(
                objective, init_p, method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 2500, 'ftol': 1e-12}
            )
            if res.fun < best_score:
                best_score, best_params = res.fun, res.x
        except:
            continue
    
    # Stage 5: Final refinement
    if best_params is not None and best_score < 0.03:
        try:
            res = minimize(
                objective, best_params, method='L-BFGS-B', bounds=bounds,
                options={'maxiter': 1500, 'ftol': 1e-14, 'gtol': 1e-10}
            )
            if res.fun < best_score:
                best_params = res.x
        except:
            pass
    
    return best_params if best_params is not None else init
# EVOLVE-BLOCK-END

#3 Run 5 R² = 0.997703

▼

Python

# EVOLVE-BLOCK-START
"""
Enhanced polynomial scaling law with domain-aware interactions
Key improvements:
- Pure power law basis without residual computation
- Polynomial saturation (quadratic + cubic) for better modeling
- Enhanced asymmetric interactions with complementarity effects
- Domain-specific adaptive weighting in optimization
- Refined regularization strategy
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Enhanced polynomial scaling law with asymmetric interactions
    Parameters (35 total):
    - 5 power law coefficients
    - 5 flexible exponents (per domain)
    - 5 quadratic coefficients (diminishing returns)
    - 5 cubic coefficients (saturation modeling)
    - 10 asymmetric interaction weights
    - 5 output biases
    """
    X = np.atleast_2d(np.asarray(data_points))
    N, F = X.shape
    params = np.asarray(params).ravel()
    
    # Parse parameters (35 total)
    power_coeffs = params[:F]           # 5 - power law coefficients
    exponents = params[F:2*F]           # 5 - flexible exponents
    quad_coeffs = params[2*F:3*F]       # 5 - quadratic terms
    cubic_coeffs = params[3*F:4*F]      # 5 - cubic saturation
    interactions = params[4*F:4*F+10]   # 10 - asymmetric interactions
    biases = params[4*F+10:5*F+10]      # 5 - output biases
    
    # Numerical stability
    X_safe = np.clip(X, 1e-6, 1.0)
    exponents = np.clip(exponents, 0.3, 1.8)
    
    # 1. Pure power law terms: c * x^exp (no residual subtraction)
    power_terms = power_coeffs[None, :] * (X_safe ** exponents[None, :])
    
    # 2. Quadratic diminishing returns: q * x^2
    quad_terms = quad_coeffs[None, :] * (X_safe ** 2)
    
    # 3. Cubic saturation effects: cub * x^3
    cubic_terms = cubic_coeffs[None, :] * (X_safe ** 3)
    
    # 4. Enhanced asymmetric interactions with complementarity
    interaction_matrix = np.zeros((N, F))
    idx = 0
    for i in range(F):
        for j in range(i+1, F):
            weight = interactions[idx]
            
            # Symmetric multiplicative coupling (geometric-like)
            sym_coupling = 0.5 * weight * X_safe[:, i] * X_safe[:, j]
            
            # Asymmetric complementarity effects
            # When domain i has high weight and domain j low, there's a cross effect
            asym_i_to_j = 0.3 * weight * X_safe[:, i] * (1.0 - X_safe[:, j])
            asym_j_to_i = 0.3 * weight * X_safe[:, j] * (1.0 - X_safe[:, i])
            
            # Distribute effects to both domains
            interaction_matrix[:, i] += sym_coupling + asym_j_to_i
            interaction_matrix[:, j] += sym_coupling + asym_i_to_j
            
            idx += 1
    
    # Combine all components
    predictions = power_terms + quad_terms + cubic_terms + interaction_matrix + biases[None, :]
    
    return predictions


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced fitting with domain-aware initialization and adaptive optimization
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    N, F = X.shape
    P = 35
    
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    if y.shape[1] == 1:
        y = np.tile(y, (1, F))
    
    # Compute statistics
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    y_min = np.min(y, axis=0)
    y_max = np.max(y, axis=0)
    x_mean = np.mean(X, axis=0)
    x_std = np.std(X, axis=0)
    
    # Smart initialization
    init = np.zeros(P)
    
    # Power coefficients: correlation-based initialization
    for i in range(F):
        if x_std[i] > 1e-6:
            corr = np.corrcoef(X[:, i], y[:, i])[0, 1]
            if not np.isnan(corr):
                slope = corr * y_std[i] / (x_std[i] + 1e-6)
                init[i] = slope * 0.75  # Slightly more aggressive than before
            else:
                init[i] = 0.6
        else:
            init[i] = 0.6
    
    # Exponents: start near linear
    init[F:2*F] = 1.0
    
    # Quadratic: small negative for diminishing returns
    init[2*F:3*F] = -0.12
    
    # Cubic: very small for subtle saturation
    init[3*F:4*F] = -0.025
    
    # Interactions: start small positive
    init[4*F:4*F+10] = 0.02
    
    # Biases: domain means
    init[4*F+10:5*F+10] = y_mean
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            residuals = pred - y
            
            # Per-domain MSE
            per_dim_mse = np.mean(residuals ** 2, axis=0)
            
            # Adaptive weighting: balance based on geometric mean of variance and range
            difficulty = np.sqrt(y_std * (y_max - y_min) + 0.1)
            weights = 1.0 / (difficulty + 0.05)
            weights = weights / np.sum(weights)
            
            mse = np.sum(weights * per_dim_mse)
            
            # Differential regularization strategy
            # Light on power terms, moderate on polynomial, stronger on interactions
            reg_power = 0.00003 * np.sum(params[:2*F] ** 2)
            reg_poly = 0.0001 * np.sum(params[2*F:4*F] ** 2)
            reg_inter = 0.0003 * np.sum(params[4*F:4*F+10] ** 2)
            
            # Penalty for extreme exponents
            exp_penalty = 0.0015 * np.sum(np.maximum(0, np.abs(params[F:2*F]) - 1.6) ** 2)
            
            return mse + reg_power + reg_poly + reg_inter + exp_penalty
        except:
            return 1e10
    
    # Bounds
    bounds = []
    bounds.extend([(-8, 8)] * F)           # power coefficients
    bounds.extend([(0.3, 1.8)] * F)        # exponents
    bounds.extend([(-1.2, 0.5)] * F)       # quadratic
    bounds.extend([(-0.4, 0.15)] * F)      # cubic
    bounds.extend([(-1.2, 1.2)] * 10)      # interactions
    bounds.extend([(y_min.min() - 2*y_std.max(), 
                    y_max.max() + 2*y_std.max())] * F)  # biases
    
    # Multi-start optimization with adaptive perturbations
    best_result = None
    best_loss = float('inf')
    
    for restart in range(8):
        if restart == 0:
            init_trial = init.copy()
        else:
            base = best_result.x if best_result is not None else init
            # Progressive perturbation increase
            if restart < 4:
                noise_scale = 0.025 * (1 + restart * 0.3)
            else:
                noise_scale = 0.08 * (1 + (restart-4) * 0.2)
            
            init_trial = base + np.random.randn(P) * noise_scale
            
            # Clip to bounds
            for i, (lb, ub) in enumerate(bounds):
                init_trial[i] = np.clip(init_trial[i], lb, ub)
        
        try:
            result = minimize(
                objective,
                init_trial,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 750, 'ftol': 1e-9}
            )
            
            if result.fun < best_loss:
                best_loss = result.fun
                best_result = result
        except:
            continue
    
    return best_result.x if best_result is not None else init
# EVOLVE-BLOCK-END

#4 Run 2 R² = 0.996392

▼

Python

# EVOLVE-BLOCK-START
"""
Refined scaling law with domain-specific power laws and improved numerical stability
Uses exactly 35 parameters: 7 per output domain (5 domains)
- 5 linear coefficients (domain mixture effects)
- 1 power exponent (non-linearity per output)
- 1 bias term (base loss level)

Key improvements:
- Strict 35 parameter limit (7×5)
- Simplified power law model for better generalization
- Enhanced numerical stability with gradient clipping
- Improved initialization using robust statistics
- Adaptive regularization based on parameter magnitude
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Power law model per domain: y_d = sum_i(w_{d,i} * x_i^alpha_d) + b_d
    
    Parameters per output d:
    - w_{d,i}: 5 weights showing how input domain i affects output domain d
    - alpha_d: power exponent for non-linear transformation
    - b_d: bias term for base loss level
    """
    X = np.atleast_2d(np.asarray(data_points))  # (N, 5)
    N, F = X.shape
    params = np.asarray(params)
    
    if params.ndim == 1:
        params = params[None, :]  # (1, P)
    T, P = params.shape
    
    predictions = np.zeros((N, T))
    
    for t in range(T):
        # Extract 7 parameters for this output domain
        p = params[t, :] if T > 1 else params[0, :]
        
        weights = p[:F]       # 5 weights
        exponent = p[F]       # 1 exponent
        bias = p[F+1]         # 1 bias
        
        # Numerical safety: clip inputs and exponent
        eps = 1e-10
        X_safe = np.clip(X, eps, 1.0)
        exp_clipped = np.clip(exponent, 0.25, 2.8)
        
        # Apply power transformation
        X_powered = X_safe ** exp_clipped
        
        # Weighted sum plus bias
        predictions[:, t] = np.dot(X_powered, weights) + bias
    
    return predictions[:, 0] if T == 1 else predictions


def fit_scaling_law(data_points, loss_values):
    """
    Robust fitting with correlation-based initialization and multi-start optimization
    """
    X = np.atleast_2d(np.asarray(data_points))  # (N, 5)
    y = np.asarray(loss_values)
    N, F = X.shape
    
    if y.ndim == 1:
        y2d = y[:, None]
    else:
        y2d = y
    T = y2d.shape[1]
    
    # Exactly 7 parameters per output
    P = 7
    
    # Robust initialization using correlations and statistics
    init = np.zeros((T, P))
    
    for t in range(T):
        y_t = y2d[:, t]
        y_mean = np.mean(y_t)
        y_std = np.std(y_t)
        
        # Initialize weights based on correlation with clipping
        for i in range(F):
            X_std = np.std(X[:, i])
            
            if X_std > 1e-8 and y_std > 1e-8:
                # Compute Pearson correlation
                X_centered = X[:, i] - np.mean(X[:, i])
                y_centered = y_t - y_mean
                corr = np.sum(X_centered * y_centered) / (N * X_std * y_std)
                corr = np.clip(corr, -1.0, 1.0)
            else:
                corr = 0.0
            
            # Scale by standard deviation ratio with dampening
            scale_factor = (y_std / (X_std + 1e-8)) * 0.18
            init[t, i] = corr * scale_factor
        
        # Initialize exponent close to 1 (near-linear)
        init[t, F] = 1.0
        
        # Initialize bias as mean output
        init[t, F+1] = y_mean
    
    # Adaptive regularization
    reg_weight = 0.003
    reg_exp = 0.0008
    
    def objective(flat_params):
        params = flat_params.reshape(T, P)
        
        try:
            pred = scaling_law_func(X, params)
            
            if pred.ndim == 1:
                pred = pred[:, None]
            
            # Mean squared error
            mse = np.mean((pred - y2d) ** 2)
            
            # L2 regularization on weights (adaptive to magnitude)
            weights_all = params[:, :F]
            reg_weights = reg_weight * np.mean(weights_all ** 2)
            
            # Regularization on exponents (prefer near 1)
            exponents = params[:, F]
            reg_exponents = reg_exp * np.sum((exponents - 1.0) ** 2)
            
            return mse + reg_weights + reg_exponents
        except:
            return 1e10
    
    # Bounds for numerical stability
    bounds = []
    for t in range(T):
        # Weights: allow both positive and negative
        bounds.extend([(-6.5, 6.5)] * F)
        # Exponent: bounded for stability
        bounds.append((0.25, 2.8))
        # Bias: reasonable range for loss values
        bounds.append((0.2, 6.5))
    
    # Multi-start optimization for robustness
    best_result = None
    best_loss = float('inf')
    
    # Attempt 1: Smart initialization
    result = minimize(
        objective,
        init.ravel(),
        method='L-BFGS-B',
        bounds=bounds,
        options={'maxiter': 2500, 'ftol': 1e-11, 'gtol': 1e-8}
    )
    
    if result.fun < best_loss:
        best_loss = result.fun
        best_result = result
    
    # Attempt 2-5: Perturbed initializations with decreasing noise
    np.random.seed(42)
    for restart_idx in range(4):
        # Controlled perturbation with decreasing magnitude
        noise_scale = 0.13 * (1.0 - restart_idx * 0.22)
        perturbed_init = init + np.random.randn(*init.shape) * noise_scale
        
        # Keep exponents in reasonable range
        perturbed_init[:, F] = np.clip(perturbed_init[:, F], 0.7, 1.5)
        
        result = minimize(
            objective,
            perturbed_init.ravel(),
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 2500, 'ftol': 1e-11, 'gtol': 1e-8}
        )
        
        if result.fun < best_loss:
            best_loss = result.fun
            best_result = result
    
    # Return best result or fallback to initialization
    if best_result is not None and best_result.success:
        params_opt = best_result.x.reshape(T, P)
    else:
        params_opt = init
    
    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#5 Run 1 R² = 0.995904

▼

Python

# EVOLVE-BLOCK-START
"""
Refined scaling law with SVD initialization and focused optimization
Key improvements over current:
- SVD-based coefficient initialization for numerical robustness
- Log-space exponent estimation with physical constraints
- Focused 5-trial multi-start with strategic perturbations
- Balanced regularization tuned for generalization
- Enhanced numerical stability
Total: 35 parameters (25 coeffs + 5 exponents + 5 biases)
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Structured multi-output scaling law: y_i = sum_j(c_ij * x_j^e_j) + b_i
    
    Parameters (35 total):
    - coeffs: 25 params (5 outputs × 5 domains)
    - exponents: 5 params (shared domain transformations)
    - biases: 5 params (output-specific offsets)
    """
    X = np.atleast_2d(np.asarray(data_points))
    N, F = X.shape
    params = np.asarray(params)
    
    if params.ndim == 1:
        params = params[None, :]
    
    # Multi-output case with 35 parameters
    if params.shape[0] == 1 and F == 5 and params.shape[1] == 35:
        coeffs = params[0, :25].reshape(5, 5)
        exponents = params[0, 25:30]
        biases = params[0, 30:35]
        
        # Enhanced numerical stability with tighter clipping
        X_safe = np.clip(X, 1e-10, 1.0)
        X_transformed = X_safe ** exponents[None, :]
        pred = X_transformed @ coeffs.T + biases[None, :]
        
    else:
        # Fallback for non-standard cases
        T = params.shape[0]
        P = params.shape[1]
        
        if P >= 2 * F + 1:
            coeffs = params[:, :F]
            exponents = params[:, F:2*F]
            bias = params[:, -1]
            
            X_safe = np.clip(X, 1e-10, 1.0)
            pred = (coeffs[None, :, :] * (X_safe[:, None, :] ** exponents[None, :, :])).sum(axis=2) + bias[None, :]
            pred = pred[:, 0] if pred.shape[1] == 1 else pred
        else:
            pred = np.zeros((N, T))
    
    return pred


def fit_scaling_law(data_points, loss_values):
    """
    Enhanced optimization with SVD initialization and focused multi-start
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    N, F = X.shape
    
    if y.ndim == 1:
        y = y[:, None]
    T = y.shape[1]
    
    if F == 5 and T == 5:
        X_safe = np.clip(X, 1e-10, 1.0)
        
        # SVD-based coefficient initialization for numerical stability
        init_coeffs = np.zeros((5, 5))
        for t in range(T):
            try:
                # SVD provides better conditioning than direct least squares
                U, s, Vt = np.linalg.svd(X, full_matrices=False)
                # Regularize small singular values to prevent instability
                s_reg = np.where(s > 0.01 * s[0], 1.0 / s, 0.0)
                X_pinv = Vt.T @ np.diag(s_reg) @ U.T
                init_coeffs[t] = X_pinv @ y[:, t]
            except:
                # Fallback to ridge regression
                try:
                    XtX = X.T @ X + 0.008 * np.eye(F)
                    Xty = X.T @ y[:, t]
                    init_coeffs[t] = np.linalg.solve(XtX, Xty)
                except:
                    init_coeffs[t] = np.ones(F) * (np.mean(y[:, t]) / F)
        
        # Log-space exponent estimation for physically grounded initialization
        init_exponents = np.ones(5) * 0.72
        for f in range(F):
            if np.std(X[:, f]) > 1e-6:
                sorted_idx = np.argsort(X[:, f])
                x_sorted = X[sorted_idx, f]
                y_mean_sorted = np.mean(y[sorted_idx], axis=1)
                
                # Log-space regression for power law estimation
                valid_mask = x_sorted > 0.05
                if np.sum(valid_mask) >= 5:
                    x_log = np.log(x_sorted[valid_mask] + 1e-10)
                    y_log = np.log(y_mean_sorted[valid_mask] + 1e-10)
                    
                    try:
                        # Fit log(y) ~ exp * log(x) + const
                        x_mean = np.mean(x_log)
                        y_mean = np.mean(y_log)
                        cov = np.sum((x_log - x_mean) * (y_log - y_mean))
                        var = np.sum((x_log - x_mean) ** 2)
                        
                        if abs(var) > 1e-8:
                            exp_est = cov / var
                            init_exponents[f] = np.clip(exp_est, 0.3, 1.4)
                        else:
                            init_exponents[f] = 0.72
                    except:
                        init_exponents[f] = 0.72
                
                # Additional curvature refinement
                if len(x_sorted) >= 6:
                    n = len(x_sorted)
                    q1, q2, q3 = n // 4, n // 2, 3 * n // 4
                    
                    s1 = (y_mean_sorted[q2] - y_mean_sorted[q1]) / max(x_sorted[q2] - x_sorted[q1], 1e-6)
                    s2 = (y_mean_sorted[q3] - y_mean_sorted[q2]) / max(x_sorted[q3] - x_sorted[q2], 1e-6)
                    
                    # Adjust based on curvature pattern
                    if s1 > s2 * 1.25:
                        init_exponents[f] = min(init_exponents[f], 0.65)
                    elif s2 > s1 * 1.25:
                        init_exponents[f] = max(init_exponents[f], 1.0)
        
        # Robust bias initialization using trimmed mean
        init_biases = np.zeros(5)
        for t in range(T):
            pred_linear = X @ init_coeffs[t]
            residuals = y[:, t] - pred_linear
            # Trimmed mean for outlier resistance
            sorted_res = np.sort(residuals)
            trim = max(1, N // 10)
            init_biases[t] = np.mean(sorted_res[trim:-trim]) * 0.45
        
        init = np.concatenate([init_coeffs.ravel(), init_exponents, init_biases])
        
        def objective(params_flat):
            pred = scaling_law_func(X, params_flat)
            mse = np.mean((pred - y) ** 2)
            
            coeffs_part = params_flat[:25]
            exponents_part = params_flat[25:30]
            
            # Balanced regularization tuned for generalization
            reg_coeffs = 0.00045 * np.sum(coeffs_part ** 2)
            
            # Domain-specific target exponents
            target_exp = np.array([0.68, 0.72, 0.75, 0.70, 0.73])
            reg_exponents = 0.0020 * np.sum((exponents_part - target_exp) ** 2)
            
            return mse + reg_coeffs + reg_exponents
        
        # Tighter bounds for better stability
        bounds = [(None, None)] * 25 + [(0.25, 1.8)] * 5 + [(None, None)] * 5
        
        # Focused 5-trial multi-start with strategic exploration
        best_result = None
        best_loss = float('inf')
        
        for trial in range(5):
            if trial == 0:
                # Use smart initialization
                init_trial = init.copy()
            elif trial == 1:
                # Perturb coefficients with moderate noise
                init_trial = init.copy()
                init_trial[:25] += np.random.randn(25) * 0.075
            elif trial == 2:
                # Explore different exponent values
                init_trial = init.copy()
                init_trial[25:30] += np.random.randn(5) * 0.10
                init_trial[25:30] = np.clip(init_trial[25:30], 0.25, 1.8)
            elif trial == 3:
                # Balanced global perturbation
                init_trial = init + np.random.randn(35) * 0.09
                init_trial[25:30] = np.clip(init_trial[25:30], 0.25, 1.8)
            else:
                # Conservative exploration with bias focus
                init_trial = init.copy()
                init_trial[:25] += np.random.randn(25) * 0.05
                init_trial[30:35] += np.random.randn(5) * 0.15
            
            try:
                result = minimize(
                    objective, 
                    init_trial, 
                    method='L-BFGS-B', 
                    bounds=bounds, 
                    options={'maxiter': 1700, 'ftol': 1e-11, 'gtol': 1e-9}
                )
                
                if result.fun < best_loss:
                    best_loss = result.fun
                    best_result = result
            except:
                continue
        
        params_opt = best_result.x if (best_result is not None and best_result.success) else init
        
    else:
        # Fallback for non-standard dimensions
        P = 2 * F + 1
        init = np.ones((T, P))
        init[:, :F] = np.mean(y) / F
        init[:, F:2*F] = 0.72
        
        def objective(flat_params):
            params = flat_params.reshape(T, P)
            pred = scaling_law_func(X, params)
            mse = np.mean((pred - y) ** 2)
            reg = 0.0007 * np.sum(params ** 2)
            return mse + reg
        
        result = minimize(objective, init.ravel(), method='L-BFGS-B')
        params_opt = result.x.reshape(T, P) if result.success else init
        params_opt = params_opt[0] if T == 1 else params_opt
    
    return params_opt
# EVOLVE-BLOCK-END