
Parallel Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.999957
Mean R²: 0.999907
Min R²: 0.999885
Runs: 5
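
The R² values above are presumably the standard coefficient of determination between predicted and measured losses; the evaluation harness is not shown on this page, but a minimal sketch of that metric is:

Python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot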

All Runs (sorted by R²)

#1 Run 4 R² = 0.999957 (Best)
Python
# EVOLVE-BLOCK-START
"""
Enhanced parallel scaling law with a multiplicative interaction between
model size N and parallelism P:
L = C * N^(-alpha) / (1 + beta * log(P)) + gamma
The logarithmic denominator captures the diminishing returns of parallelization.
4 parameters: alpha, beta, gamma, C
"""
import numpy as np
from scipy.optimize import differential_evolution, minimize

def scaling_law_func(data_points, params):
    """
    Scaling law with logarithmic parallel benefit:
    L = C * N^(-alpha) / (1 + beta * log(P)) + gamma
    params = [alpha, beta, gamma, C]
    """
    X = np.atleast_2d(np.asarray(data_points))
    params = np.asarray(params)
    
    if params.ndim == 1:
        params = params[None, :]
    
    N, P = X[:, 0], X[:, 1]
    
    results = []
    for p in params:
        alpha, beta, gamma, C = p
        # Clamp inputs before the power/log transforms (N is a model-size
        # count, P a parallelism degree) so both stay well defined
        N_safe = np.maximum(N, 1e6)
        P_safe = np.maximum(P, 1.0)
        
        # Logarithmic parallel scaling with diminishing returns
        log_P = np.log(P_safe)
        parallel_factor = 1.0 + beta * log_P
        parallel_factor = np.maximum(parallel_factor, 0.1)  # Guard against a near-zero denominator
        
        pred = C * np.power(N_safe, -alpha) / parallel_factor + gamma
        results.append(pred)
    
    result = np.array(results).T
    return result[:, 0] if result.shape[1] == 1 else result
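
# Shape contract (illustrative): for X of shape (n, 2) holding (N, P) rows, a
# single parameter vector [alpha, beta, gamma, C] yields an (n,) prediction
# array, while a (k, 4) batch of parameter vectors yields an (n, k) array.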


def fit_scaling_law(data_points, loss_values):
    """
    Robust optimization with expanded search space for new formulation
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    
    if y.ndim == 1:
        y = y[:, None]
    
    N_samples, N_outputs = y.shape
    all_params = []
    
    for i in range(N_outputs):
        y_i = y[:, i]
        y_std = np.std(y_i)
        
        def objective(params):
            pred = scaling_law_func(X, params)
            if pred.ndim > 1:
                pred = pred[:, i]
            mse = np.mean((pred - y_i) ** 2)
            # Light L2 penalty on alpha and beta, scaled with the data's spread
            reg = (1e-4 * y_std) * (params[0]**2 + params[1]**2)
            return mse + reg
        
        # Adjusted bounds for logarithmic formulation
        # beta now scales log(P) instead of P directly
        bounds = [
            (0.01, 0.50),    # alpha: parameter scaling exponent
            (0.01, 0.40),    # beta: log-parallel scaling coefficient
            (0.50, 2.50),    # gamma: asymptotic loss floor
            (1e-4, 1e5)      # C: scaling constant
        ]
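        # Interpretation: gamma is the predicted loss floor as N -> inf, and
        # with beta > 0 extra parallelism strictly shrinks the reducible term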
        
        # Multi-restart global optimization
        best_score = float('inf')
        best_x = None
        
        # Restart from several seeds for robustness with the new formulation
        for seed in [42, 123, 256, 789]:
            result = differential_evolution(
                objective,
                bounds,
                maxiter=350,
                popsize=16,
                seed=seed,
                strategy='best1bin',
                atol=1e-9,
                tol=1e-9,
                mutation=(0.5, 1.5),
                recombination=0.8,
                polish=False,    # local refinement is done explicitly below
                workers=1
            )
            
            if result.fun < best_score:
                best_score = result.fun
                best_x = result.x
        
        # Multi-method local refinement
        methods = ['L-BFGS-B', 'TNC']
        best_local_score = best_score
        best_local_x = best_x
        
        for method in methods:
            try:
                result_local = minimize(
                    objective,
                    best_x,
                    method=method,
                    bounds=bounds,
                    options={'maxiter': 2000, 'ftol': 1e-11}
                )
                
                if result_local.success and result_local.fun < best_local_score:
                    best_local_score = result_local.fun
                    best_local_x = result_local.x
            except Exception:
                # A method may reject its options or fail to converge; keep
                # the differential-evolution result in that case
                continue
        
        final_params = best_local_x if best_local_score < best_score else best_x
        all_params.append(final_params)
    
    params_array = np.array(all_params)
    return params_array[0] if N_outputs == 1 else params_array
# EVOLVE-BLOCK-END
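
Usage sketch (assumptions: the two functions above are in scope, data_points is an (n, 2) array with columns (N = model size, P = parallelism), loss_values is the matching (n,) vector, and the synthetic parameters below are illustrative rather than fitted values):

Python
import numpy as np

rng = np.random.default_rng(0)
N = rng.uniform(1e7, 1e10, size=60)             # model sizes
P = rng.integers(1, 65, size=60).astype(float)  # degrees of parallelism
X = np.column_stack([N, P])

# Generate losses from the same functional form, plus small noise
true_params = [0.12, 0.15, 1.8, 50.0]           # [alpha, beta, gamma, C]
y = scaling_law_func(X, true_params) + rng.normal(0.0, 1e-3, size=60)

fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
print("fitted params:", fitted, "R^2:", r2)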
#2 Run 1 R² = 0.999911
#3 Run 2 R² = 0.999894
#4 Run 3 R² = 0.999889
#5 Run 5 R² = 0.999885