
LR-BSZ Scaling Law

Agent: SLDAgent
Model: Gemini 3 Pro Preview
Best R²: 0.940806
Mean R²: 0.940713
Min R²: 0.940533
Runs: 5

All Runs (sorted by R²)

#1 (Best) Run 2 R² = 0.940806
Python
# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
Implements a refined scaling law with terms for Model Size, Dataset Size, 
Learning Rate (quadratic penalty in log-space), and Batch Size.
Uses physics-informed bounds and multi-start L-BFGS-B optimization.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """
    Predicts LM loss based on scaling law parameters.
    
    Model Form:
    L = E + A*N^(-alpha) + B*D^(-beta) + C*(log(lr) - log_lr_opt)^2 + F*bsz^G
    
    Where:
    log_lr_opt = d0 + d1*log(N) + d2*log(bsz)
    
    Inputs are normalized:
    - N: Parameters / 1e9
    - D: Tokens / 1e10
    - lr: Learning Rate / 1e-3
    - bsz: Batch Size / 2048
    
    Parameters (11 total):
    0: E (Irreducible loss)
    1: A (Model size coeff)
    2: alpha (Model size exponent)
    3: B (Data size coeff)
    4: beta (Data size exponent)
    5: C (LR penalty coeff)
    6: d0 (Opt LR intercept)
    7: d1 (Opt LR slope w.r.t N)
    8: d2 (Opt LR slope w.r.t bsz)
    9: F (Batch size coeff)
    10: G (Batch size exponent)
    """
    # Normalization constants (based on dataset statistics)
    # Feature order: [lr, bsz, data_size, non_embedding_param_size]
    # Using 2048 for bsz as it matches the max value in the dataset
    scales = np.array([1e-3, 2048.0, 1e10, 1e9])
    
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    X_norm = X / scales[None, :]
    
    lr = X_norm[:, 0]
    bsz = X_norm[:, 1]
    D = X_norm[:, 2]
    N_param = X_norm[:, 3]
    
    # Handle params shape
    params = np.asarray(params, dtype=np.float64)
    original_ndim = params.ndim
    if original_ndim == 1:
        params = params[None, :]
    
    # Unpack parameters
    E     = params[:, 0:1]
    A     = params[:, 1:2]
    alpha = params[:, 2:3]
    B     = params[:, 3:4]
    beta  = params[:, 4:5]
    C     = params[:, 5:6]
    d0    = params[:, 6:7]
    d1    = params[:, 7:8]
    d2    = params[:, 8:9]
    F     = params[:, 9:10]
    G     = params[:, 10:11]
    
    eps = 1e-9
    
    # Broadcasting preparation
    N_p = N_param[None, :]
    D_p = D[None, :]
    lr_p = lr[None, :]
    bsz_p = bsz[None, :]
    
    # 1. Power laws for N and D
    # Take abs() of alpha and beta so both terms decay as N and D grow
    term_N = A * ((N_p + eps) ** (-np.abs(alpha)))
    term_D = B * ((D_p + eps) ** (-np.abs(beta)))
    
    # 2. Learning Rate Penalty
    # Optimal LR depends on N and bsz
    log_N = np.log(N_p + eps)
    log_bsz = np.log(bsz_p + eps)
    log_lr = np.log(lr_p + eps)
    
    # Linear relationship in log-log space
    opt_log_lr = d0 + d1 * log_N + d2 * log_bsz
    term_LR = C * ((log_lr - opt_log_lr) ** 2)
    
    # 3. Direct batch-size effect (independent of its influence on the optimal LR)
    term_BSZ = F * ((bsz_p + eps) ** G)
    
    # Total Loss
    pred = E + term_N + term_D + term_LR + term_BSZ
    
    # Return shape handling
    pred = pred.T
    if original_ndim == 1:
        return pred[:, 0]
    return pred

def fit_scaling_law(data_points, loss_values):
    """
    Fits the scaling law parameters using multi-start L-BFGS-B.
    """
    X = np.atleast_2d(np.asarray(data_points, dtype=np.float64))
    y = np.asarray(loss_values, dtype=np.float64).flatten()
    
    def objective(p):
        preds = scaling_law_func(X, p)
        return np.mean((preds - y)**2)
    
    # Parameter Bounds
    # E: [1.0, 2.2] - Irreducible loss must be < min(loss) ~ 2.1
    # A, B: [0, inf] - Coefficients
    # alpha, beta: [0.01, 1.0] - Exponents typically < 1.0
    # C: [0, inf] - Penalty curvature
    # d1: [-2.0, 0.5] - LR usually decreases with Model Size
    # d2: [-0.5, 2.0] - LR usually increases with Batch Size
    bounds = [
        (1.0, 2.2),   # E
        (0.0, None),  # A
        (0.01, 1.0),  # alpha
        (0.0, None),  # B
        (0.01, 1.0),  # beta
        (0.0, None),  # C
        (None, None), # d0
        (-2.0, 0.5),  # d1
        (-0.5, 2.0),  # d2
        (None, None), # F
        (None, None)  # G
    ]
    
    # Multiple initializations to avoid local minima
    guesses = [
        # 1. Standard Scaling: alpha, beta ~ 0.1-0.3
        [1.8, 0.5, 0.1, 0.5, 0.1, 0.2, 0.0, -0.2, 0.2, 0.01, 0.0],
        # 2. Chinchilla-like: alpha, beta ~ 0.5
        [1.6, 2.0, 0.5, 2.0, 0.5, 0.5, -0.5, -0.5, 0.5, 0.05, -0.1],
        # 3. High Variance: small E, large coeffs
        [1.2, 5.0, 0.2, 5.0, 0.2, 0.1, 0.0, -0.1, 0.1, 0.0, 0.0]
    ]
    
    best_loss = np.inf
    best_params = np.array(guesses[0])
    
    for p0 in guesses:
        try:
            res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds,
                          options={'maxiter': 3000, 'ftol': 1e-10, 'gtol': 1e-10})
            if res.fun < best_loss:
                best_loss = res.fun
                best_params = res.x
        except Exception:
            continue
            
    return best_params
# EVOLVE-BLOCK-END
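
Below is a minimal usage sketch, not part of the submitted program: it builds synthetic finetuning configurations in the feature order the code documents ([lr, bsz, data_size, non_embedding_param_size]), fits the law, and scores the fit with R². The configuration ranges and the "true" parameter vector are invented for illustration and are not taken from the benchmark dataset; the sketch assumes scaling_law_func and fit_scaling_law from the listing above are already in scope.

Python
import numpy as np

rng = np.random.default_rng(0)

# 200 synthetic finetuning configurations (illustrative ranges only)
n = 200
lr  = 10 ** rng.uniform(-4.5, -2.5, n)        # learning rate
bsz = rng.choice([256, 512, 1024, 2048], n)   # batch size
D   = 10 ** rng.uniform(9.0, 11.0, n)         # training tokens
N   = 10 ** rng.uniform(8.0, 10.0, n)         # non-embedding parameters
X = np.column_stack([lr, bsz, D, N])

# Hypothetical ground-truth parameters in the 11-slot layout from the
# docstring, used only to generate a noisy target for this demo.
true_params = np.array([1.8, 0.5, 0.3, 0.4, 0.25, 0.05,
                        -0.5, -0.3, 0.3, 0.02, -0.2])
y = scaling_law_func(X, true_params) + rng.normal(0.0, 0.01, n)

# Fit and evaluate
fitted = fit_scaling_law(X, y)
pred = scaling_law_func(X, fitted)
ss_res = np.sum((y - pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print("fitted params:", np.round(fitted, 3))
print("R^2 on synthetic data:", 1.0 - ss_res / ss_tot)

Because the fit is a three-start local optimization, the recovered parameters need not match true_params exactly; the R² check is the relevant sanity test here.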
#2 Run 4 R² = 0.940792
#3 Run 3 R² = 0.940739
#4 Run 1 R² = 0.940693
#5 Run 5 R² = 0.940533