← Back to Leaderboard

SFT Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.997360
Mean R²: 0.993169
Min R²: 0.991148
Runs: 5

All Runs (sorted by R²)

Best Run 5 R² = 0.997360
Python
# EVOLVE-BLOCK-START
"""
Scaling law for LLM finetuning: L = a + b/(d + N^c)
Theoretically grounded with optimized hybrid strategy: DE global search + targeted grid refinement
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Evaluate the finetuning scaling law L = a + b / (d + N^c).

    Parameters
    ----------
    data_points : array-like
        Dataset sizes N; flattened and floored at 1.0 before use.
    params : sequence
        [a, b, c, d] where a is the asymptotic loss floor, b the scaling
        coefficient, c the power exponent (clipped to [0.01, 2.0]) and d a
        stability offset (forced strictly positive via abs(d) + 1e-6).

    Returns
    -------
    numpy.ndarray
        Predicted loss for each input size.
    """
    a, b, c, d = params[:4]

    # Flatten and floor sizes at 1 so N^c stays well-defined.
    sizes = np.maximum(np.asarray(data_points).ravel(), 1.0)
    exponent = np.clip(c, 0.01, 2.0)
    offset = np.abs(d) + 1e-6  # keeps the denominator strictly positive

    return a + b / (offset + sizes ** exponent)


def fit_scaling_law(data_points, loss_values):
    """
    Fit L = a + b/(d + N^c) to observed (size, loss) pairs.

    Strategy: global differential-evolution search first, then a small grid
    of L-BFGS-B restarts around the DE solution, then one final tightly
    converged L-BFGS-B polish.

    Parameters
    ----------
    data_points : array-like
        Dataset sizes N (flattened).
    loss_values : array-like
        Observed losses, same length as data_points (flattened).

    Returns
    -------
    numpy.ndarray
        Fitted [a, b, c, d]; a heuristic default when fewer than four points
        are supplied or when every optimizer attempt fails.
    """
    X = np.asarray(data_points).ravel()
    y = np.asarray(loss_values).ravel()

    # Four free parameters require at least four points; otherwise return a
    # heuristic guess instead of attempting an under-determined fit.
    if len(X) < 4:
        return np.array([np.mean(y), 0.1, 0.5, 1.0])

    y_min, y_max = np.min(y), np.max(y)
    y_range = y_max - y_min

    # Smart initialization: floor slightly below the best observed loss,
    # coefficient proportional to the observed loss spread.
    a_init = y_min * 0.9
    b_init = y_range * 5.0

    def objective(params):
        """MSE with penalties for unphysical (b <= 0 or d < 0) parameters."""
        try:
            pred = scaling_law_func(X, params)
            mse = np.mean((pred - y) ** 2)

            if params[1] <= 0 or params[3] < 0:
                mse += 1e8
            if np.isnan(mse) or np.isinf(mse):
                return 1e10
            return mse
        except Exception:
            # Numerical failure inside the model: report a huge loss so the
            # optimizer steers away rather than crashing the whole fit.
            return 1e10

    # Tight parameter bounds (see scaling_law_func for parameter meanings).
    bounds = [
        (y_min * 0.3, y_max * 1.2),     # a: asymptotic loss floor
        (y_range * 0.1, y_range * 100), # b: positive scaling coefficient
        (0.01, 2.0),                     # c: power exponent
        (1e-6, 1000.0)                   # d: positive stability offset
    ]

    best_params = None
    best_loss = np.inf

    # Strategy 1: global differential evolution optimization (primary).
    # Fixed seed keeps the fit deterministic across calls.
    try:
        result_de = differential_evolution(
            objective, bounds, seed=42, maxiter=150,
            popsize=15, atol=1e-10, tol=1e-10,
            workers=1, updating='deferred'
        )
        if result_de.fun < best_loss:
            best_loss = result_de.fun
            best_params = result_de.x
    except Exception:
        pass  # fall through to the fallback initialization below

    # Strategy 2: selective grid of L-BFGS-B restarts around the DE solution
    # in the (c, d) plane.
    if best_params is not None:
        c_center = best_params[2]
        d_center = best_params[3]

        c_grid = [max(0.01, c_center * 0.5), c_center, min(2.0, c_center * 2.0)]
        d_grid = [max(1e-6, d_center * 0.3), d_center, min(1000.0, d_center * 3.0)]

        for c_val in c_grid:
            for d_val in d_grid:
                x0 = np.array([a_init, b_init, c_val, d_val])
                try:
                    result = minimize(
                        objective, x0, method='L-BFGS-B',
                        bounds=bounds,
                        options={'maxiter': 400, 'ftol': 1e-11}
                    )
                    if result.fun < best_loss:
                        best_loss = result.fun
                        best_params = result.x
                except Exception:
                    pass  # a bad start point is not fatal; try the next one

    # Strategy 3: final L-BFGS-B refinement with tight tolerances.
    if best_params is not None:
        try:
            result = minimize(
                objective, best_params, method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 500, 'ftol': 1e-12, 'gtol': 1e-11}
            )
            if result.fun < best_loss:
                # Bug fix: keep best_loss in sync with best_params so the
                # pair remains consistent for any future comparison.
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass  # keep the best result found so far

    # Fallback when every optimizer attempt failed outright.
    if best_params is None:
        best_params = np.array([a_init, b_init, 0.5, 1.0])

    # Ensure the strictly-positive parameters respect their bounds.
    best_params[1] = np.clip(best_params[1], bounds[1][0], bounds[1][1])
    best_params[3] = np.clip(best_params[3], bounds[3][0], bounds[3][1])

    return best_params[:4]

# EVOLVE-BLOCK-END
#2 Run 4 R² = 0.993348
#3 Run 1 R² = 0.992090
#4 Run 2 R² = 0.991899
#5 Run 3 R² = 0.991148