# EVOLVE-BLOCK-START
"""
Scaling law for LLM parallel training with enhanced optimization
Uses 4-parameter model: loss = a * N^b / P^c + d
- N = num_params, P = parallel_size
- Captures both parameter scaling and parallel benefit with superior fitting
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
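
# Worked example of the functional form (illustrative parameter values, not fitted):
# with a = 2.0, b = -0.05, c = 0.15, d = 1.5, N = 1e9 parameters, P = 8:
#   loss = 2.0 * (1e9)**-0.05 / 8**0.15 + 1.5
#        ≈ 2.0 * 0.355 / 1.366 + 1.5
#        ≈ 2.02

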
def scaling_law_func(data_points, params):
"""
Scaling law: loss = a * num_params^b / parallel_size^c + d
params: [a, b, c, d] (exactly 4 parameters)
- a: coefficient (> 0)
- b: exponent for num_params (typically -0.1 to 0.2)
- c: exponent for parallel_size (typically 0 to 0.5)
- d: baseline loss offset
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
params = np.asarray(params, dtype=np.float64)
num_params = X[:, 0]
parallel_size = X[:, 1]
a = params[0]
b = params[1]
c = params[2]
d = params[3]
# Numerically stable computation with epsilon guards
eps = 1e-10
num_params_safe = np.maximum(num_params, eps)
parallel_size_safe = np.maximum(parallel_size, eps)
# Compute: a * N^b / P^c + d
numerator = a * np.power(num_params_safe, b)
denominator = np.power(parallel_size_safe, c)
loss = numerator / (denominator + eps) + d
return loss
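

# Example call (illustrative values, not a fitted model):
#   X = np.array([[1e8, 4], [1e9, 16]])           # columns: num_params, parallel_size
#   scaling_law_func(X, [2.0, -0.05, 0.15, 1.5])  # -> array of 2 predicted losses

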
def fit_scaling_law(data_points, loss_values):
"""
Fit 4-parameter scaling law using three-phase optimization:
Phase 1: Smart local optimization from data-driven initialization
Phase 2: Global optimization if needed with fine convergence
Phase 3: Local refinement on best solution found
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64).ravel()
num_params = X[:, 0]
parallel_size = X[:, 1]
# Data statistics
n_min, n_max = num_params.min(), num_params.max()
p_min, p_max = parallel_size.min(), parallel_size.max()
y_min, y_max = y.min(), y.max()
y_range = y_max - y_min
def objective(params):
"""Objective function with robustness checks"""
try:
pred = scaling_law_func(X, params)
if np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
return 1e10
mse = np.mean((pred - y) ** 2)
return max(float(mse), 0)
        except Exception:
            return 1e10
    # Parameter bounds; the bound on d is derived from the observed loss range
bounds = [
(1e-6, 1e3), # a: coefficient (positive)
(-0.2, 0.2), # b: num_params exponent (small)
(0.0, 0.5), # c: parallel_size exponent (small positive)
(y_min - 0.5, y_max + 0.5) # d: baseline offset
]
    # Data-driven initial guess based on the observed loss range
y_span = y_range if y_range > 1e-6 else 1.0
a_init = y_span / np.power(np.maximum(n_max, 1), 0.05)
b_init = -0.05
c_init = 0.15
d_init = y_min - 0.05 * y_span
x0 = np.array([a_init, b_init, c_init, d_init])
x0 = np.clip(x0, [b[0] for b in bounds], [b[1] for b in bounds])
    # Phase 1: Local optimization from the data-driven initialization with tight tolerances
result_local = minimize(
objective,
x0,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 1000, 'ftol': 1e-11, 'gtol': 1e-9}
)
best_params = result_local.x if result_local.success else x0
best_loss = objective(best_params)
    # Phase 2: Global optimization if the local result is suboptimal
    if best_loss > 0.005:  # MSE threshold for accepting the Phase 1 fit as-is
result_global = differential_evolution(
objective,
bounds,
maxiter=500,
popsize=20,
seed=42,
atol=1e-11,
tol=1e-11,
workers=1,
updating='deferred',
strategy='best1bin'
)
if result_global.fun < best_loss:
best_params = result_global.x
best_loss = result_global.fun
        # Phase 3: Local refinement of the global solution
result_local2 = minimize(
objective,
best_params,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 1000, 'ftol': 1e-11, 'gtol': 1e-9}
)
if result_local2.fun < best_loss:
best_params = result_local2.x
else:
        # Fine-tune the already-good local result
result_local_refined = minimize(
objective,
best_params,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 1000, 'ftol': 1e-12, 'gtol': 1e-10}
)
if result_local_refined.fun < best_loss:
best_params = result_local_refined.x
return best_params
# EVOLVE-BLOCK-END
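

# Minimal usage sketch (outside the evolve block): fits the scaling law to synthetic data
# generated from assumed ground-truth parameters and reports the resulting MSE. The
# parameter values and data ranges below are illustrative assumptions, not benchmark settings.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    true_params = [2.0, -0.05, 0.15, 1.5]            # assumed ground truth for the demo
    N = rng.uniform(1e7, 1e10, size=64)              # model sizes (num_params)
    P = rng.integers(1, 65, size=64).astype(float)   # parallel sizes
    X_demo = np.column_stack([N, P])
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, size=64)
    fitted = fit_scaling_law(X_demo, y_demo)
    preds = scaling_law_func(X_demo, fitted)
    print("fitted params:", np.round(fitted, 4))
    print("demo MSE:", float(np.mean((preds - y_demo) ** 2)))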