
Domain Mixture Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.996946
Mean R²: 0.996107
Min R²: 0.995604
Runs: 5

All Runs (sorted by R²)

Best Run 5 R² = 0.996946
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM fine-tuning scenarios.
Improved with a "Non-linear Signed Rank-1 Transfer" model (35 parameters).
This model generalizes the successful Rank-1 approach by letting the
transfer impact scale non-linearly with donor data size (the tau parameter),
while retaining signed transfer (interference) and pre-training offsets.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Non-linear Signed Rank-1 Transfer Scaling Law.
    
    Model:
    Loss_i = A_i * (D_total_i)^(-alpha_i) + E_i
    D_total_i = D_pre_i + x_i + beta_i * sum_{j != i} (gamma_j * x_j^(tau_j))
    
    Parameters (35):
    0-4:   log_A (Amplitude)
    5-9:   log_alpha (Scaling exponent)
    10-14: log_E (Irreducible loss)
    15-19: logit_beta (Receptivity)
    20-24: raw_gamma (Transferability strength, signed)
    25-29: log_D_pre (Pre-training offset)
    30-34: log_tau (Transfer non-linearity)
    """
    X = np.atleast_2d(np.asarray(data_points))
    params = np.asarray(params).flatten()
    
    # Parameter Extraction
    log_A = params[0:5]
    log_alpha = params[5:10]
    log_E = params[10:15]
    logit_beta = params[15:20]
    raw_gamma = params[20:25]
    log_D_pre = params[25:30]
    log_tau = params[30:35]
    
    # Transformation to physical space
    A = np.exp(log_A)
    alpha = np.exp(log_alpha)
    E = np.exp(log_E)
    beta = 1.0 / (1.0 + np.exp(-logit_beta))
    gamma = np.tanh(raw_gamma)
    D_pre = np.exp(log_D_pre)
    tau = np.exp(log_tau)
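    # These transforms keep every quantity in its valid range: A, alpha, E,
    # D_pre, and tau stay positive via exp; beta lies in (0, 1) via the sigmoid;
    # gamma lies in (-1, 1) via tanh, so negative values can model interference.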
    
    # Compute transformed inputs for transfer
    # Add a small epsilon so x**tau and its gradient (which involves log(x)) stay finite at x = 0
    # X_trans_j = x_j ^ tau_j
    X_safe = X + 1e-9
    X_trans = X_safe ** tau[None, :]
    
    # Weighted transfer components
    # Gamma_X: (N, 5)
    Gamma_X = X_trans * gamma[None, :]
    
    # Sum of all transfer sources (N, 1)
    Sum_Gamma_X = np.sum(Gamma_X, axis=1, keepdims=True)
    
    # Transfer to i from others: Sum - Self
    # Subtract own contribution: gamma_i * x_i^tau_i
    Transfer_Input = Sum_Gamma_X - Gamma_X
    
    # Apply receptivity
    # beta is (5,) -> (1, 5)
    Transfer_Term = beta[None, :] * Transfer_Input
    
    # Total Effective Data
    # D_total = D_pre + x_i + Transfer
    # Note: Target domain data x_i is treated linearly (standard scaling)
    D_total = D_pre[None, :] + X + Transfer_Term
    
    # Numerical stability
    D_total = np.maximum(D_total, 1e-8)
    
    # Predictions
    pred = A[None, :] * (D_total ** -alpha[None, :]) + E[None, :]
    
    return pred

def fit_scaling_law(data_points, loss_values):
    X = np.atleast_2d(np.asarray(data_points))
    Y = np.atleast_2d(np.asarray(loss_values))
    n_domains = 5
    
    # --- Stage 1: Robust Independent Fit ---
    # Fit A, alpha, E, D_pre assuming no transfer.
    # This establishes the baseline scaling curve for each domain.
    
    init_params_list = []
    
    for i in range(n_domains):
        x_i = X[:, i]
        y_i = Y[:, i]
        y_min, y_max = np.min(y_i), np.max(y_i)
        
        # Grid search for initialization to avoid local minima
        best_mse = float('inf')
        best_p = None
        
        # Heuristic grid
        # alpha: slope, d_guess: offset
        for d_guess in [1e-4, 1e-2, 0.1]:
            for alpha_guess in [0.3, 0.7, 1.5]:
                # Estimate A based on endpoints
                # y_range ~ A * (d^-alpha - (1+d)^-alpha)
                denom = (d_guess**-alpha_guess) - ((1.0+d_guess)**-alpha_guess)
                denom = max(denom, 1e-6)
                A_guess = max((y_max - y_min) / denom, 1e-4)
                
                p0_local = [
                    np.log(A_guess),
                    np.log(alpha_guess),
                    np.log(max(y_min - 0.05, 1e-5)),
                    np.log(d_guess)
                ]
                
                def obj_local(p):
                    A = np.exp(p[0])
                    al = np.exp(p[1])
                    E = np.exp(p[2])
                    d = np.exp(p[3])
                    pred = A * ((x_i + d)**-al) + E
                    return np.mean((pred - y_i)**2)
                
                try:
                    # L-BFGS-B with bounds
                    # alpha in ~[0.05, 7.4], E < y_min, D_pre in ~[1e-6, 1]
                    bounds = [(None, None), (-3, 2), (None, np.log(y_min)), (-14, 0)]
                    res = minimize(obj_local, p0_local, method='L-BFGS-B', bounds=bounds)
                    if res.fun < best_mse:
                        best_mse = res.fun
                        best_p = res.x
                except Exception:
                    # Skip failed local fits; other grid points may still succeed
                    pass
        
        if best_p is None:
            # Fallback
            best_p = np.array([0.0, -0.5, np.log(max(y_min-0.1, 1e-5)), -4.6])
            
        init_params_list.append(best_p)
        
    init_params_list = np.array(init_params_list)
    
    # --- Stage 2: Joint Optimization ---
    # Initialize full model parameters.
    # Start with tau = 1.0 (log_tau = 0) -> Linear transfer initially.
    # Start with beta small, gamma neutral.
    
    p0_joint = np.concatenate([
        init_params_list[:, 0], # log_A
        init_params_list[:, 1], # log_alpha
        init_params_list[:, 2], # log_E
        np.full(5, -3.0),       # logit_beta (sigmoid(-3) ~ 0.05)
        np.full(5, 0.0),        # raw_gamma (tanh(0) = 0)
        init_params_list[:, 3], # log_D_pre
        np.full(5, 0.0)         # log_tau (tau = 1)
    ])
    
    def obj_joint(p):
        preds = scaling_law_func(X, p)
        mse = np.mean((preds - Y)**2)
        
        # Regularization
        
        # 1. Weak L2 on all params to prevent drift
        reg_l2 = 1e-7 * np.sum(p**2)
        
        # 2. Sparsity on gamma (prefer simpler transfer models)
        # indices 20-24 are raw_gamma
        reg_gamma = 1e-5 * np.sum(p[20:25]**2)
        
        # 3. Regularize tau towards 1.0 (log_tau towards 0)
        # indices 30-34 are log_tau
        # We prefer linear transfer unless data strongly suggests otherwise
        reg_tau = 1e-4 * np.sum(p[30:35]**2)
        
        # 4. Stability for D_pre (indices 25-29)
        # Penalize if log_d < -12 (d < 6e-6) to avoid numerical instability
        log_d = p[25:30]
        pen_d = 1e-3 * np.sum(np.maximum(0, -12.0 - log_d)**2)
        
        return mse + reg_l2 + reg_gamma + reg_tau + pen_d
        
    try:
        # BFGS is efficient for 35 parameters
        res = minimize(obj_joint, p0_joint, method='BFGS', 
                      options={'maxiter': 2000, 'gtol': 1e-6})
        best_p = res.x
    except Exception:
        # Fall back to the Stage-1 initialization if the joint fit fails
        best_p = p0_joint
        
    return best_p
# EVOLVE-BLOCK-END
#2 Run 2 R² = 0.996107
#3 Run 3 R² = 0.996106
#4 Run 4 R² = 0.995770
#5 Run 1 R² = 0.995604
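
A minimal usage sketch of the best run's functions, assuming scaling_law_func and fit_scaling_law are in scope, data_points is an (N, 5) array of per-domain data sizes, and loss_values is an (N, 5) array of per-domain losses. The synthetic arrays and the pooled R² computation below are illustrative only and are not part of the benchmark harness.

Python
import numpy as np

# Synthetic mixtures over 5 domains (illustrative values, not benchmark data)
rng = np.random.default_rng(0)
data_points = rng.uniform(0.01, 1.0, size=(20, 5))

# Generate losses from the model itself with hand-picked "true" parameters
true_params = np.concatenate([
    np.zeros(5),        # log_A     -> A = 1
    np.full(5, -0.7),   # log_alpha -> alpha ~ 0.5
    np.full(5, -1.0),   # log_E     -> E ~ 0.37
    np.full(5, -3.0),   # logit_beta
    np.full(5, 0.3),    # raw_gamma
    np.full(5, -4.0),   # log_D_pre
    np.zeros(5),        # log_tau   -> tau = 1
])
loss_values = scaling_law_func(data_points, true_params)

# Fit, predict, and score with a pooled R^2 over all domains
fitted = fit_scaling_law(data_points, loss_values)
pred = scaling_law_func(data_points, fitted)
ss_res = np.sum((loss_values - pred) ** 2)
ss_tot = np.sum((loss_values - loss_values.mean()) ** 2)
print("R^2 =", 1.0 - ss_res / ss_tot)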