
LR-BSZ Scaling Law

Agent: SLDAgent
Model: Claude Haiku 4.5
Best R²: 0.922917
Mean R²: 0.899351
Min R²: 0.883183
Runs: 5

All Runs (sorted by R²)

Best Run 2 R² = 0.922917
Python
# EVOLVE-BLOCK-START
"""
High-performance scaling law combining proven 10-parameter model with optimized 
initialization and multi-stage refinement. Balances expressiveness and generalization.
"""
import numpy as np
from scipy.optimize import minimize, differential_evolution

def scaling_law_func(data_points, params):
    """
    Compute predicted loss using multi-term scaling law in log-space.
    
    Model: loss = a + b1*log(D) + b2*log(N) + b3*log(lr) + b4*log(bsz)
                  + c1*log(D)*log(N) + c2*log(lr)*log(bsz)
                  + d1*(log(lr))^2 + d2*(log(D))^2 + d3*log(N)*log(lr)
    
    Captures Chinchilla-style scaling, non-monotonic lr effects, and key interactions.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    params = np.asarray(params, dtype=np.float64)
    
    if params.ndim == 1:
        params = params[None, :]
    
    # Extract and safely clip features
    lr = np.clip(X[:, 0], 1e-6, 1.0)
    bsz = np.clip(X[:, 1], 1e-3, 1e6)
    data_size = np.clip(X[:, 2], 1e6, 1e12)
    param_size = np.clip(X[:, 3], 1e6, 1e12)
    
    # Log transformation
    log_lr = np.log(lr)
    log_bsz = np.log(bsz)
    log_data = np.log(data_size)
    log_param = np.log(param_size)
    
    # Proven normalization constants from best performers
    log_lr_norm = (log_lr + 11.0) / 4.0
    log_bsz_norm = (log_bsz - 2.77) / 3.0
    log_data_norm = (log_data - 14.5) / 4.5
    log_param_norm = (log_param - 17.5) / 3.0
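    # The centers correspond roughly to lr ~ e^-11 ≈ 1.7e-5, bsz ~ e^2.77 ≈ 16,
    # D ~ e^14.5 ≈ 2.0e6, and N ~ e^17.5 ≈ 4.0e7, presumably so that each
    # normalized feature stays in a unit-order range for typical inputs.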
    
    N = X.shape[0]

    def _predict(p):
        # Evaluate the 10-term polynomial for a single parameter vector
        pred = p[0] * np.ones(N)
        pred += p[1] * log_data_norm
        pred += p[2] * log_param_norm
        pred += p[3] * log_lr_norm
        pred += p[4] * log_bsz_norm
        pred += p[5] * log_data_norm * log_param_norm
        pred += p[6] * log_lr_norm * log_bsz_norm
        pred += p[7] * (log_lr_norm ** 2)
        pred += p[8] * (log_data_norm ** 2)
        if len(p) > 9:
            pred += p[9] * log_param_norm * log_lr_norm
        return pred

    if params.shape[0] == 1:
        return _predict(params[0])
    else:
        return np.column_stack([_predict(p) for p in params])


def fit_scaling_law(data_points, loss_values):
    """
    Three-stage optimization proven to achieve best results.
    
    Stage 1: Smart correlation-based initialization
    Stage 2: Global search with differential_evolution
    Stage 3: Precision refinement with BFGS
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    n_params = 10
    
    # Compute feature statistics for initialization
    log_lr = np.log(np.clip(X[:, 0], 1e-6, 1.0))
    log_bsz = np.log(np.clip(X[:, 1], 1e-3, 1e6))
    log_data = np.log(np.clip(X[:, 2], 1e6, 1e12))
    log_param = np.log(np.clip(X[:, 3], 1e6, 1e12))
    
    # Apply proven normalization
    log_lr_norm = (log_lr + 11.0) / 4.0
    log_bsz_norm = (log_bsz - 2.77) / 3.0
    log_data_norm = (log_data - 14.5) / 4.5
    log_param_norm = (log_param - 17.5) / 3.0
    
    init_bias = np.mean(y)
    
    # Compute Pearson correlations for parameter initialization, guarding
    # against zero-variance features
    def safe_corr(x):
        if np.std(x) > 1e-10 and np.std(y) > 1e-10:
            return np.corrcoef(x, y)[0, 1]
        return 0.0
    
    corr_data = safe_corr(log_data_norm)
    corr_param = safe_corr(log_param_norm)
    corr_lr = safe_corr(log_lr_norm)
    corr_bsz = safe_corr(log_bsz_norm)
    
    # Smart initialization with proven scaling factors
    init_params = np.array([
        init_bias,           # p[0]: bias
        -0.15 * corr_data,   # p[1]: log_data (expected negative: more data = lower loss)
        -0.10 * corr_param,  # p[2]: log_param
        0.05 * corr_lr,      # p[3]: log_lr
        -0.02 * corr_bsz,    # p[4]: log_bsz
        0.01,                # p[5]: interaction data-param
        -0.005,              # p[6]: interaction lr-bsz
        0.02,                # p[7]: quadratic lr
        0.005,               # p[8]: quadratic data
        0.01                 # p[9]: interaction param-lr
    ])
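    # These values only seed the optimizer: the correlation terms give each
    # slope a data-driven starting point, which Stages 1-3 below refine freely.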
    
    def objective(params):
        # Mean-squared error of the prediction; large penalty on failure
        try:
            pred = scaling_law_func(X, params)
            mse = np.mean((pred - y) ** 2)
            return mse if np.isfinite(mse) else 1e10
        except Exception:
            return 1e10
    
    # Stage 1: Local optimization from smart initialization
    result_local = minimize(
        objective,
        init_params,
        method='L-BFGS-B',
        bounds=[(-10, 10) for _ in range(n_params)],
        options={'maxiter': 500, 'ftol': 1e-8}
    )
    
    best_params = result_local.x if result_local.success else init_params
    best_loss = objective(best_params)
    
    # Stage 2: Global search with differential evolution
    bounds = [(-5, 5) for _ in range(n_params)]
    result_global = differential_evolution(
        objective,
        bounds,
        maxiter=300,
        workers=1,
        seed=42,
        atol=1e-9,
        tol=1e-9,
        polish=True
    )
    
    if result_global.fun < best_loss:
        best_params = result_global.x
        best_loss = result_global.fun
    
    # Stage 3: Final polish with BFGS
    result_final = minimize(
        objective,
        best_params,
        method='BFGS',
        options={'maxiter': 200}
    )
    
    if result_final.success and result_final.fun < best_loss:
        best_params = result_final.x
    
    return best_params
# EVOLVE-BLOCK-END
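
Usage sketch (not part of the submitted run): assuming the two functions above are in scope and that the input columns are ordered (lr, bsz, data_size, param_size) as in the feature extraction, one might exercise the fit on synthetic data as below. The generated dataset, target formula, and the resulting R² are purely illustrative, not benchmark values.

import numpy as np

# Synthetic (lr, bsz, D, N) -> loss data, illustrative only
rng = np.random.default_rng(0)
n = 200
lr = 10 ** rng.uniform(-5, -3, n)        # learning rates
bsz = 2.0 ** rng.integers(4, 10, n)      # batch sizes 16..512
D = 10 ** rng.uniform(7, 10, n)          # data sizes
Nparam = 10 ** rng.uniform(7, 9, n)      # parameter counts
X = np.column_stack([lr, bsz, D, Nparam])

# Hypothetical Chinchilla-like target with a non-monotonic lr term plus noise
y = (1.8 + 400.0 / Nparam**0.34 + 1.0e3 / D**0.28
     + 0.05 * (np.log(lr) + 9.0) ** 2 + rng.normal(0.0, 0.01, n))

params = fit_scaling_law(X, y)
pred = scaling_law_func(X, params)
r2 = 1.0 - np.sum((y - pred) ** 2) / np.sum((y - y.mean()) ** 2)
print(f"in-sample R^2: {r2:.4f}")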
#2 Run 3 R² = 0.907465
#3 Run 4 R² = 0.895133
#4 Run 5 R² = 0.888055
#5 Run 1 R² = 0.883183