U-shaped Scaling Law

Agent: SLDAgent
Model: Claude Sonnet 4.5
Best R²: 0.931613
Mean R²: 0.838363
Min R²: 0.785188
Runs: 5
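
For reference, the R² values above are presumably the standard coefficient of determination between the fitted law's predictions and the measured losses; a minimal sketch of that computation (the function name is illustrative, not from the benchmark harness):

Python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: R^2 = 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot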

All Runs (sorted by R²)

#1 Run 1 (Best) R² = 0.931613
Python
# EVOLVE-BLOCK-START
"""
Simplified U-shaped scaling law for the double-descent pattern.
Uses a shifted quadratic with exponential modulation, chosen for
numerical stability and fitting quality.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    U-shaped form: y = a*(x-c)^2 + b*(x-c) + d*exp(-e*|x-c|) + f
    
    Parameters (6):
    - a: quadratic strength (U-shape curvature)
    - b: linear term (asymmetry)
    - c: horizontal shift (minimum location)
    - d: exponential amplitude (initial descent)
    - e: exponential decay rate
    - f: vertical offset (baseline)
    """
    X = np.atleast_2d(np.asarray(data_points))
    log_flops = X[:, 0]
    
    params = np.asarray(params)
    if params.ndim == 1:
        params = params[None, :]
    
    # Pad short parameter vectors with zeros so the 6-way unpack below is safe
    if params.shape[1] < 6:
        params = np.pad(params, ((0, 0), (0, 6 - params.shape[1])), constant_values=0)
    
    # Only the first parameter row is used; batched parameter sets are ignored
    a, b, c, d, e, f = params[0, :6]
    
    # Shifted coordinate for centering
    x_shift = log_flops - c
    
    # Quadratic base for U-shape
    quadratic = a * x_shift**2 + b * x_shift
    
    # Exponential modulation; clip the exponent to keep np.exp numerically safe
    exp_arg = np.clip(-np.abs(e) * np.abs(x_shift), -50, 50)
    exponential = d * np.exp(exp_arg)
    
    return quadratic + exponential + f


def fit_scaling_law(data_points, loss_values):
    """
    Fit using intelligent multi-start local optimization with adaptive fallback
    """
    X = np.atleast_2d(np.asarray(data_points))
    y = np.asarray(loss_values)
    log_flops = X[:, 0]
    
    # Data statistics
    y_mean = np.mean(y)
    y_std = np.std(y)
    y_min = np.min(y)
    y_max = np.max(y)
    y_range = y_max - y_min
    
    x_min = np.min(log_flops)
    x_max = np.max(log_flops)
    x_range = x_max - x_min
    x_mean = np.mean(log_flops)
    
    # Find empirical minimum for smart initialization
    min_idx = np.argmin(y)
    x_at_min = log_flops[min_idx]
    
    def objective(params):
        try:
            pred = scaling_law_func(X, params)
            mse = np.mean((pred - y)**2)
            # Minimal regularization for numerical stability
            reg = 1e-8 * np.sum(params**2)
            return mse + reg
        except Exception:
            # Penalize parameter vectors that trigger numerical errors
            return 1e10
    
    # Parameter bounds [a, b, c, d, e, f]
    bounds = [
        (0, 4*y_range),                              # a: positive for U-shape
        (-3*y_range, 3*y_range),                     # b: linear asymmetry
        (x_min - 0.6, x_max + 0.6),                  # c: shift parameter
        (-4*y_range, y_range),                       # d: exponential amplitude
        (0.1, 10.0),                                  # e: decay rate
        (y_min - 1.5*y_std, y_max + 1.5*y_std)       # f: baseline offset
    ]
    
    # Smart initialization strategies based on data
    init_attempts = [
        # Strategy 1: Conservative centered at empirical min
        [y_range*0.35, 0, x_at_min, -y_std*0.8, 1.0, y_mean],
        
        # Strategy 2: Stronger U-shape with moderate exponential
        [y_range*0.6, -y_std*0.3, x_at_min, -1.5*y_std, 1.3, y_mean],
        
        # Strategy 3: Gentle U with strong initial descent
        [y_range*0.25, y_std*0.2, x_at_min, -2*y_std, 0.9, y_mean],
        
        # Strategy 4: Early minimum bias
        [y_range*0.4, -y_std*0.4, x_min + 0.35*x_range, -y_std*1.2, 1.1, y_mean],
        
        # Strategy 5: Late minimum bias
        [y_range*0.4, y_std*0.3, x_max - 0.35*x_range, -y_std*1.2, 1.1, y_mean],
        
        # Strategy 6: Sharp curvature
        [y_range*0.8, 0, x_at_min, -y_std*0.6, 1.8, y_mean],
        
        # Strategy 7: Centered on data mean
        [y_range*0.45, -y_std*0.15, x_mean, -y_std, 1.15, y_mean],
    ]
    
    best_result = None
    best_loss = float('inf')
    
    # Multi-start local optimization
    for init in init_attempts:
        try:
            res = minimize(
                objective, 
                init, 
                method='L-BFGS-B', 
                bounds=bounds,
                options={'maxiter': 1000, 'ftol': 1e-10}
            )
            if res.fun < best_loss:
                best_loss = res.fun
                best_result = res
        except Exception:
            continue
    
    # Global search fallback when local optimization is insufficient (MSE above 0.25)
    if best_result is None or best_loss > 0.25:
        try:
            res_de = differential_evolution(
                objective, 
                bounds, 
                maxiter=180, 
                popsize=15, 
                seed=42, 
                atol=1e-9, 
                tol=1e-9, 
                polish=True, 
                workers=1
            )
            if res_de.fun < best_loss:
                best_result = res_de
        except Exception:
            pass
    
    # Return best result or robust fallback
    if best_result is not None and hasattr(best_result, 'x'):
        return best_result.x
    
    # Robust fallback based on data
    return np.array([y_range*0.35, 0, x_at_min, -y_std*0.8, 1.0, y_mean])
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.888006
#3 Run 3 R² = 0.793763
#4 Run 4 R² = 0.793244
#5 Run 5 R² = 0.785188
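
For context, a minimal sketch of how the best run's code might be exercised end to end, assuming scaling_law_func and fit_scaling_law from the listing above are in scope; the synthetic double-descent curve and all names below are illustrative assumptions, not part of the benchmark harness:

Python
import numpy as np

# Synthetic double-descent-like data: a quadratic bowl plus an initial dip,
# matching the model family y = a*(x-c)^2 + b*(x-c) + d*exp(-e*|x-c|) + f
rng = np.random.default_rng(0)
log_flops = np.linspace(8.0, 14.0, 60)
true_y = 0.4 * (log_flops - 11.0) ** 2 - 1.2 * np.exp(-np.abs(log_flops - 11.0)) + 3.0
y = true_y + rng.normal(scale=0.05, size=log_flops.shape)

X = log_flops[:, None]              # fit_scaling_law expects 2-D data_points
params = fit_scaling_law(X, y)      # returns [a, b, c, d, e, f]
pred = scaling_law_func(X, params)

ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print("R^2 =", 1.0 - ss_res / ss_tot)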