# EVOLVE-BLOCK-START
"""
High-performance scaling law combining theoretical grounding with empirical optimization.
Restores the log-power formulation that achieved 0.5032, with critical improvements:
- Better numerical stability through logarithmic scaling
- Comprehensive parameter space with interaction modeling
- Three-stage optimization strategy
- Data-driven initialization with multiple emphases
- Light L2 regularization to discourage extreme parameter values
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution
from scipy.stats import linregress
def scaling_law_func(data_points, params):
"""
Hybrid scaling law with log-power form:
loss = a + b*log(D)^alpha + c*log(N)^beta + d*(log(lr)-log(lr_opt))^2
+ e*log(bsz)^gamma + f*interaction
Theoretical grounding:
- Log-power scaling for data/params: captures Chinchilla scaling law structure
- Quadratic LR penalty: reflects optimization landscape near optimum
- Log-power batch size: models gradient noise scaling
- LR-BSZ interaction: synergistic effects on training dynamics
- Numerically stable across 8 orders of magnitude in hyperparameters
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
params = np.asarray(params, dtype=np.float64).flatten()
# Extract features with safety clipping for stability
lr = np.clip(X[:, 0], 1e-5, 1.0)
bsz = np.clip(X[:, 1], 1.0, 10000.0)
data_size = np.clip(X[:, 2], 1e8, 1e12)
param_size = np.clip(X[:, 3], 1e7, 1e10)
# Ensure 10 parameters
if len(params) < 10:
params = np.pad(params, (0, 10 - len(params)), mode='constant', constant_values=0.0)
a = params[0] # baseline loss
b = params[1] # data size coefficient
alpha = np.clip(params[2], 0.2, 2.5) # data size exponent
c = params[3] # param size coefficient
beta = np.clip(params[4], 0.2, 2.5) # param size exponent
d = np.clip(params[5], 0.0, 0.6) # learning rate penalty strength
lr_opt = np.clip(params[6], 1e-4, 0.1) # optimal learning rate
e = params[7] # batch size coefficient
gamma = np.clip(params[8], 0.05, 2.0) # batch size exponent
interaction = params[9] # lr-bsz interaction
# Logarithmic transformations for numerical stability
log_data = np.log(data_size)
log_param = np.log(param_size)
log_lr = np.log(lr)
log_bsz = np.log(bsz)
log_lr_opt = np.log(lr_opt)
# Data scaling: log-power form (more stable than direct power laws)
data_term = b * np.power(log_data, alpha)
# Parameter scaling: log-power form
param_term = c * np.power(log_param, beta)
# Learning rate term: quadratic penalty around optimal
    # Theory: the loss landscape is approximately quadratic near the optimal learning rate
lr_penalty = log_lr - log_lr_opt
lr_term = d * np.power(lr_penalty, 2)
# Batch size term: log-power form
# Theory: gradient noise scales as O(1/sqrt(batch_size))
bsz_term = e * np.power(log_bsz, gamma)
# Learning rate-batch size interaction: models synergistic effects
# Captures how larger batches can tolerate different learning rates
interaction_term = interaction * np.power(lr_penalty, 2) * log_bsz
# Combine all components with safety clipping
pred = a + data_term + param_term + lr_term + bsz_term + interaction_term
pred = np.clip(pred, 1.5, 4.5)
return pred
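

# Illustrative end-to-end usage (placeholder values, not fitted results), assuming the
# column order [learning_rate, batch_size, data_size, param_size] used above:
#   theta = fit_scaling_law(X_train, y_train)      # fit on observed runs
#   pred = scaling_law_func([[3e-4, 256, 1e10, 3e8]], theta)  # losses clipped to [1.5, 4.5]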
def fit_scaling_law(data_points, loss_values):
"""
Multi-stage optimization with data-driven initialization and aggressive refinement.
Strategy:
1. Multi-start local optimization from diverse initializations
2. Global differential evolution as primary search
3. Fine-tuning with aggressive local optimization
"""
X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
y = np.asarray(loss_values, dtype=np.float64).flatten()
# Normalize for numerical stability during optimization
y_mean = np.mean(y)
y_std = np.std(y) + 1e-8
y_norm = (y - y_mean) / y_std
def objective(params):
try:
pred = scaling_law_func(X, params)
pred_norm = (pred - y_mean) / y_std
mse = np.mean((pred_norm - y_norm) ** 2)
            # Light L2 regularization on params[1:7] (b, alpha, c, beta, d, lr_opt)
            # to discourage extreme coefficient and exponent values
            reg = 0.003 * np.sum(params[1:7] ** 2)
            return mse + reg
        except Exception:
            return 1e10
# Data-driven initialization using linear regression in log space
log_data = np.log(X[:, 2])
log_param = np.log(X[:, 3])
log_lr = np.log(X[:, 0])
log_bsz = np.log(X[:, 1])
# Estimate slopes via linear regression (data-driven priors)
data_slope, _, _, _, _ = linregress(log_data, y)
param_slope, _, _, _, _ = linregress(log_param, y)
    lr_slope, _, _, _, _ = linregress(log_lr**2, y)  # estimated but not used by the inits below
bsz_slope, _, _, _, _ = linregress(log_bsz, y)
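    # The slopes above serve only as rough, data-driven priors; the divisors applied in
    # the inits below (roughly 1.5-6) are heuristic dampening factors for the log-power terms.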
# Multiple initializations with different emphases
# Each reflects a different hypothesis about scaling dynamics
inits = [
# Init 1: Balanced Chinchilla (equal data/param scaling)
np.array([y_mean, data_slope/1.8, 1.0, param_slope/1.8, 1.0, 0.07, 0.005, bsz_slope/4, 0.9, -0.005]),
# Init 2: Data-dominant (data scaling more important)
np.array([y_mean, data_slope/1.5, 1.2, param_slope/2.2, 0.8, 0.06, 0.006, bsz_slope/5, 1.0, 0.0]),
# Init 3: Parameter-dominant (parameter scaling more important)
np.array([y_mean, data_slope/2.2, 0.8, param_slope/1.5, 1.2, 0.08, 0.004, bsz_slope/5, 1.0, 0.01]),
# Init 4: Learning rate focused (stronger LR penalty)
np.array([y_mean, data_slope/2.0, 0.9, param_slope/2.0, 0.9, 0.12, 0.004, bsz_slope/6, 1.1, 0.02]),
# Init 5: Batch size focused (stronger batch effects)
np.array([y_mean, data_slope/2.5, 1.1, param_slope/2.5, 1.1, 0.05, 0.007, bsz_slope/3, 0.8, -0.02]),
# Init 6: Strong interaction effects
np.array([y_mean, data_slope/1.6, 1.3, param_slope/1.6, 1.3, 0.1, 0.006, bsz_slope/2, 1.0, 0.05]),
# Init 7: Weak exponents (smoother)
np.array([y_mean, data_slope/2.0, 0.7, param_slope/2.0, 0.7, 0.06, 0.005, bsz_slope/4, 0.7, 0.0]),
# Init 8: High interaction negative
np.array([y_mean, data_slope/1.9, 1.05, param_slope/1.9, 1.05, 0.075, 0.0055, bsz_slope/3.5, 0.95, -0.03]),
]
# Parameter bounds based on theoretical considerations
bounds = [
(1.5, 4.5), # a: baseline loss (within observed range)
(-0.5, 0.1), # b: data coefficient (mostly negative)
(0.2, 2.5), # alpha: data exponent
(-0.5, 0.1), # c: param coefficient (mostly negative)
(0.2, 2.5), # beta: param exponent
(0.0, 0.6), # d: lr penalty strength
(1e-4, 0.1), # lr_opt: optimal learning rate
(-0.3, 0.3), # e: batch size coefficient
(0.05, 2.0), # gamma: batch size exponent
(-0.15, 0.15), # interaction: LR-BSZ interaction
]
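    # Bounds follow the same parameter order unpacked in scaling_law_func:
    # (a, b, alpha, c, beta, d, lr_opt, e, gamma, interaction).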
best_params = None
best_loss = np.inf
# Stage 1: Multi-start local optimization from diverse initializations
for init in inits:
try:
result = minimize(
objective,
init,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 6000, 'ftol': 1e-10, 'gtol': 1e-8}
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
            pass
# Stage 2: Global optimization with differential evolution
# Use as primary search if local optimization didn't converge well
if best_params is None or best_loss > 0.25:
try:
result = differential_evolution(
objective,
bounds,
seed=42,
maxiter=2000,
popsize=35,
atol=1e-12,
tol=1e-12,
workers=1,
updating='deferred',
polish=True
)
if result.fun < best_loss:
best_loss = result.fun
best_params = result.x
        except Exception:
if best_params is None:
best_params = inits[0]
# Stage 3: Fine-tune with aggressive optimization
if best_params is not None and best_loss < 0.5:
try:
result = minimize(
objective,
best_params,
method='L-BFGS-B',
bounds=bounds,
options={'maxiter': 15000, 'ftol': 1e-12, 'gtol': 1e-9}
)
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
        except Exception:
            pass
return best_params if best_params is not None else inits[0]
# EVOLVE-BLOCK-END
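

# --- Minimal smoke test (outside the evolved block) ---
# A sketch, assuming the feature layout [learning_rate, batch_size, data_size,
# param_size] inferred from scaling_law_func. It generates synthetic runs from a
# hand-picked parameter vector, refits them, and reports the training MSE; the
# values below are illustrative placeholders, not results from any real sweep.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 64
    X_demo = np.column_stack([
        10.0 ** rng.uniform(-4, -1, n),    # learning rate
        2.0 ** rng.integers(5, 11, n),     # batch size (32..1024)
        10.0 ** rng.uniform(9, 11, n),     # data size in tokens
        10.0 ** rng.uniform(7.5, 9.5, n),  # parameter count
    ])
    # Hypothetical "true" coefficients in the order (a, b, alpha, c, beta, d, lr_opt, e, gamma, interaction)
    true_params = np.array([4.2, -0.04, 1.1, -0.05, 1.0, 0.05, 0.003, 0.01, 0.9, -0.005])
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, n)
    fitted = fit_scaling_law(X_demo, y_demo)
    mse = float(np.mean((scaling_law_func(X_demo, fitted) - y_demo) ** 2))
    print("fitted params:", np.round(fitted, 4))
    print("train MSE:", mse)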