import numpy as np
from scipy.optimize import least_squares
def scaling_law_func(data_points, params):
    # data_points: (N, 3) array of [unique_tokens, model_params, tokens]
    # params: length-7 array [E, A, alpha, B, beta, C, delta], or a (K, 7) batch of parameter sets
# Ensure inputs are at least 2D
X = np.atleast_2d(np.asarray(data_points))
P = np.asarray(params)
# Handle batch of parameters vs single set
squeeze_output = False
if P.ndim == 1:
P = P[None, :]
squeeze_output = True
# Extract features
unique_tokens = X[:, 0:1]
model_params = X[:, 1:2]
tokens = X[:, 2:3]
    # Normalization constants (rough geometric means of the expected domain:
    # N ~ 3e8 parameters, D ~ 3e10 tokens)
    # Centering the inputs near 1.0 keeps the power-law terms well-scaled for the optimizer
N_SCALE = 3e8
D_SCALE = 3e10
n = model_params / N_SCALE
d = tokens / D_SCALE
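    # Example: a 3e8-parameter model trained on 3e10 tokens maps to n = 1.0 and d = 1.0,
    # so each power-law term starts out roughly at the size of its coefficient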
    # Repetition ratio r = D / U (roughly the number of epochs over the unique data);
    # the small epsilon guards against division by zero
    r = tokens / (unique_tokens + 1e-9)
# Extract Parameters (enforce positive via abs)
E = np.abs(P[:, 0])
A = np.abs(P[:, 1])
alpha = np.abs(P[:, 2])
B = np.abs(P[:, 3])
beta = np.abs(P[:, 4])
C = np.abs(P[:, 5])
delta = np.abs(P[:, 6])
    # Functional form (standard Chinchilla terms plus a repetition penalty):
    #   L = E + A * n^-alpha + B * d^-beta + C * r^delta
    # where n and d are the normalized parameter/token counts and r is the repetition ratio
# Broadcasting: (N, 1) and (1, K) -> (N, K)
# Add small epsilons to bases to prevent NaN gradients/values
term_model = A[None, :] * ((n + 1e-12) ** -alpha[None, :])
term_data = B[None, :] * ((d + 1e-12) ** -beta[None, :])
term_rep = C[None, :] * ((r + 1e-12) ** delta[None, :])
pred = E[None, :] + term_model + term_data + term_rep
if squeeze_output:
return pred[:, 0]
return pred
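
# Illustrative call (shapes only; the parameter values below are made up, not fitted):
#   X = np.array([[3e10, 3e8, 3e10],     # 30B unique tokens, 300M params, 30B training tokens
#                 [3e10, 3e8, 1.2e11]])  # the same unique data repeated ~4x
#   p = np.array([1.7, 0.8, 0.35, 0.6, 0.3, 0.02, 1.2])
#   scaling_law_func(X, p)               # -> shape (2,) array of predicted losses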
def fit_scaling_law(data_points, loss_values):
    # data_points: (N, 3) array of [unique_tokens, model_params, tokens]
    # loss_values: (N,) array of observed losses
    # Returns the fitted linear-space parameters [E, A, alpha, B, beta, C, delta]
    X = np.asarray(data_points)
    y = np.asarray(loss_values)
y_min = np.min(y)
    # Optimize the coefficients A, B, C in log-space: this keeps them strictly positive
    # and handles their very different scales (e.g. C might be ~1e-5 while A is ~1.0)
    # p_log layout: [E, logA, alpha, logB, beta, logC, delta]
def residuals(p_log):
# Convert log-space params back to linear for function evaluation
p_lin = np.array([
p_log[0], # E
np.exp(p_log[1]), # A
p_log[2], # alpha
np.exp(p_log[3]), # B
p_log[4], # beta
np.exp(p_log[5]), # C
p_log[6] # delta
])
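        # least_squares minimizes 0.5 * sum(rho(r_i**2)) over this residual vector,
        # so returning the raw (N,) residuals is all that is needed here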
return scaling_law_func(X, p_lin) - y
# Heuristic Initialization Strategy
# We provide guesses for linear parameters, then convert to log space
# [E, A, alpha, B, beta, C, delta]
starts = [
# Balanced Chinchilla
[1.6, 1.0, 0.33, 1.0, 0.33, 0.001, 1.0],
# Steep scaling
[1.5, 5.0, 0.5, 5.0, 0.5, 1e-4, 0.5],
# High repetition penalty
[1.8, 0.5, 0.3, 0.5, 0.3, 0.1, 2.0],
# Data limited
[1.6, 0.1, 0.1, 2.0, 0.6, 0.01, 1.0],
# Model limited
[1.6, 2.0, 0.6, 0.1, 0.1, 0.01, 1.0],
# Conservative / Flat
[y_min*0.9, 1.0, 0.1, 1.0, 0.1, 0.0, 0.0],
]
    # Bounds for the optimization variables
    # E: [0.5, ~y_min] - the irreducible loss should sit below any observed loss
    # logA, logB, logC: unbounded
    # alpha, beta: [0, 3]
    # delta: [0, 10]
    # Cap E slightly below y_min to force the scaling terms to explain the variance;
    # the floor of 0.9 keeps the upper bound above the lower bound if y_min is unusually small
    upper_E = max(0.9, y_min - 0.01)
lower_bounds = [0.5, -np.inf, 0.0, -np.inf, 0.0, -np.inf, 0.0]
upper_bounds = [upper_E, np.inf, 3.0, np.inf, 3.0, np.inf, 10.0]
best_res = None
best_cost = float('inf')
for s in starts:
# Convert start to log space
p0 = np.array(s)
        # Clip the starting E into the feasible interval defined by the bounds
        p0[0] = np.clip(p0[0], 0.55, upper_E - 0.05)
p_log_start = np.array([
p0[0],
np.log(p0[1] + 1e-16),
p0[2],
np.log(p0[3] + 1e-16),
p0[4],
np.log(p0[5] + 1e-16),
p0[6]
])
try:
            # Use the soft_l1 loss for robustness to outliers: with f_scale=0.1,
            # residuals well below 0.1 contribute roughly quadratically to the cost
            # and residuals above 0.1 contribute roughly linearly
res = least_squares(residuals, p_log_start,
bounds=(lower_bounds, upper_bounds),
method='trf',
loss='soft_l1',
f_scale=0.1,
max_nfev=1000)
if res.cost < best_cost:
best_cost = res.cost
best_res = res
except Exception:
continue
if best_res is not None:
p = best_res.x
return np.array([
p[0], np.exp(p[1]), p[2], np.exp(p[3]), p[4], np.exp(p[5]), p[6]
])
    # Fallback: if every fit attempt failed, return the first (linear-space) initial guess
    return np.array(starts[0])
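
# --- Example usage: an illustrative sketch, not part of the fitting code above ---
# Synthetic data is generated from a hypothetical "true" parameter set purely for
# demonstration; real runs would pass measured (unique_tokens, params, tokens) rows
# and their observed losses.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_points = 64
    # Random training configurations: unique tokens, model parameters, total training tokens
    unique_tokens = rng.uniform(1e9, 5e10, n_points)
    model_params = rng.uniform(1e7, 1e10, n_points)
    tokens = unique_tokens * rng.uniform(1.0, 10.0, n_points)  # 1-10 epochs over the unique data
    X_demo = np.stack([unique_tokens, model_params, tokens], axis=1)

    # Hypothetical ground-truth parameters [E, A, alpha, B, beta, C, delta]
    true_params = np.array([1.7, 0.8, 0.35, 0.6, 0.3, 0.02, 1.2])
    y_demo = scaling_law_func(X_demo, true_params) + rng.normal(0.0, 0.01, n_points)

    fitted = fit_scaling_law(X_demo, y_demo)
    print("true:  ", true_params)
    print("fitted:", fitted)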