SLD - U-shaped Scaling Law - SLDAgent + Gemini 3 Pro Preview

All Runs (sorted by R²)

Best Run 1 R² = 0.931286

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Implements a smooth transition (sigmoid-weighted) between two linear regimes in log-flops space.
This can model monotonic, U-shaped, and inverted U-shaped scaling laws (double descent).
Uses 6 parameters: slope1, bias1, slope2, bias2, transition_point, transition_sharpness.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-blended two-line model.

    The prediction is ``(1 - sig) * (w1*x + b1) + sig * (w2*x + b2)`` with
    ``sig = sigmoid(s * (x - m))``, where ``x`` is column 0 of the input.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is read. Note that a 1-D input therefore becomes a single row.
        params: ``[w1, b1, w2, b2, m, s]`` as a 1-D array, or a 2-D array with
            one such row per parameter set.

    Returns:
        A 1-D array of predictions when there is a single parameter set,
        otherwise a 2-D array with one column per parameter set.
    """
    features = np.atleast_2d(np.asarray(data_points))
    log_flops = features[:, 0:1]  # keep as a column so it broadcasts

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # One (1, T) row per parameter; (N, 1) op (1, T) broadcasts to (N, T).
    rows = theta.T
    slope_lo, bias_lo, slope_hi, bias_hi, midpoint, sharpness = (
        rows[k:k + 1, :] for k in range(6)
    )

    # Clipping the logit keeps np.exp from overflowing.
    logit = np.clip(sharpness * (log_flops - midpoint), -50, 50)
    gate = 1.0 / (1.0 + np.exp(-logit))

    line_lo = slope_lo * log_flops + bias_lo
    line_hi = slope_hi * log_flops + bias_hi

    blended = (1.0 - gate) * line_lo + gate * line_hi

    if blended.shape[1] == 1:
        return blended[:, 0]
    return blended


def fit_scaling_law(data_points, loss_values):
    """Fit the six sigmoid-blend parameters to each loss column.

    For every target column, L-BFGS-B is started from several split points
    (the 20th/40th/60th/80th percentiles of the input column). Each start
    seeds the two linear regimes with least-squares lines through the points
    left and right of the split. The start with the lowest mean squared error
    wins; if none beats a constant prediction at the column mean, that
    constant model is returned instead.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is used as the regressor.
        loss_values: Array-like of shape (N,) or (N, T); columns are fitted
            independently.

    Returns:
        ``[w1, b1, w2, b2, m, s]`` as a (6,) array for a single target, or a
        (T, 6) array for T targets.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    _, T = y_2d.shape

    # Candidate midpoints. Duplicates (e.g. heavily tied inputs) would only
    # repeat an identical, deterministic optimisation, so drop them.
    m_candidates = np.unique(np.percentile(x_flat, [20, 40, 60, 80]))

    # Loop-invariant bounds: slopes/biases free, midpoint inside the data
    # range, sharpness positive and capped.
    bounds = [
        (None, None), (None, None),  # w1, b1
        (None, None), (None, None),  # w2, b2
        (np.min(x_flat), np.max(x_flat)),  # m
        (0.1, 100.0),  # s
    ]

    def _segment_line(mask, yt, flat_level):
        # Least-squares line through the masked points, or a flat line at
        # `flat_level` when there are too few points or the fit fails.
        if np.sum(mask) < 2:
            return 0.0, flat_level
        xs, ys = x_flat[mask], yt[mask]
        try:
            slope, intercept = np.polyfit(xs, ys, 1)
        except Exception:
            # Best-effort initialisation only; Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            return 0.0, flat_level
        return slope, intercept

    params_opt_list = []

    for t in range(T):
        yt = y_2d[:, t]

        def objective(p):
            pred = scaling_law_func(X, p)
            return np.mean((pred - yt) ** 2)

        # Default fallback: constant prediction at the mean.
        mean_y = np.mean(yt)
        best_p = np.array([0.0, mean_y, 0.0, mean_y, np.mean(x_flat), 1.0])
        best_loss = np.mean((yt - mean_y) ** 2)

        for m_init in m_candidates:
            w1_init, b1_init = _segment_line(x_flat < m_init, yt, mean_y)
            w2_init, b2_init = _segment_line(x_flat >= m_init, yt, mean_y)

            # Start with a moderately sharp transition (s = 5).
            p0 = np.array([w1_init, b1_init, w2_init, b2_init, m_init, 5.0])

            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds)
            except Exception:
                # A failed start is skipped; the remaining starts and the
                # constant fallback still apply.
                continue

            if res.fun < best_loss:
                best_loss = res.fun
                best_p = res.x

        params_opt_list.append(best_p)

    params_opt = np.array(params_opt_list)

    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#2 Run 2 R² = 0.931286

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Implements a smooth transition (sigmoid-weighted) between two linear regimes in log-flops space.
This can model monotonic, U-shaped, and inverted U-shaped scaling laws (double descent).
Uses 6 parameters: slope1, bias1, slope2, bias2, transition_point, transition_sharpness.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-blended two-line model.

    The prediction is ``(1 - sig) * (w1*x + b1) + sig * (w2*x + b2)`` with
    ``sig = sigmoid(s * (x - m))``, where ``x`` is column 0 of the input.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is read. Note that a 1-D input therefore becomes a single row.
        params: ``[w1, b1, w2, b2, m, s]`` as a 1-D array, or a 2-D array with
            one such row per parameter set.

    Returns:
        A 1-D array of predictions when there is a single parameter set,
        otherwise a 2-D array with one column per parameter set.
    """
    features = np.atleast_2d(np.asarray(data_points))
    log_flops = features[:, 0:1]  # keep as a column so it broadcasts

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # One (1, T) row per parameter; (N, 1) op (1, T) broadcasts to (N, T).
    rows = theta.T
    slope_lo, bias_lo, slope_hi, bias_hi, midpoint, sharpness = (
        rows[k:k + 1, :] for k in range(6)
    )

    # Clipping the logit keeps np.exp from overflowing.
    logit = np.clip(sharpness * (log_flops - midpoint), -50, 50)
    gate = 1.0 / (1.0 + np.exp(-logit))

    line_lo = slope_lo * log_flops + bias_lo
    line_hi = slope_hi * log_flops + bias_hi

    blended = (1.0 - gate) * line_lo + gate * line_hi

    if blended.shape[1] == 1:
        return blended[:, 0]
    return blended


def fit_scaling_law(data_points, loss_values):
    """Fit the six sigmoid-blend parameters to each loss column.

    For every target column, L-BFGS-B is started from several split points
    (the 20th/40th/60th/80th percentiles of the input column). Each start
    seeds the two linear regimes with least-squares lines through the points
    left and right of the split. The start with the lowest mean squared error
    wins; if none beats a constant prediction at the column mean, that
    constant model is returned instead.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is used as the regressor.
        loss_values: Array-like of shape (N,) or (N, T); columns are fitted
            independently.

    Returns:
        ``[w1, b1, w2, b2, m, s]`` as a (6,) array for a single target, or a
        (T, 6) array for T targets.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    _, T = y_2d.shape

    # Candidate midpoints. Duplicates (e.g. heavily tied inputs) would only
    # repeat an identical, deterministic optimisation, so drop them.
    m_candidates = np.unique(np.percentile(x_flat, [20, 40, 60, 80]))

    # Loop-invariant bounds: slopes/biases free, midpoint inside the data
    # range, sharpness positive and capped.
    bounds = [
        (None, None), (None, None),  # w1, b1
        (None, None), (None, None),  # w2, b2
        (np.min(x_flat), np.max(x_flat)),  # m
        (0.1, 100.0),  # s
    ]

    def _segment_line(mask, yt, flat_level):
        # Least-squares line through the masked points, or a flat line at
        # `flat_level` when there are too few points or the fit fails.
        if np.sum(mask) < 2:
            return 0.0, flat_level
        xs, ys = x_flat[mask], yt[mask]
        try:
            slope, intercept = np.polyfit(xs, ys, 1)
        except Exception:
            # Best-effort initialisation only; Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            return 0.0, flat_level
        return slope, intercept

    params_opt_list = []

    for t in range(T):
        yt = y_2d[:, t]

        def objective(p):
            pred = scaling_law_func(X, p)
            return np.mean((pred - yt) ** 2)

        # Default fallback: constant prediction at the mean.
        mean_y = np.mean(yt)
        best_p = np.array([0.0, mean_y, 0.0, mean_y, np.mean(x_flat), 1.0])
        best_loss = np.mean((yt - mean_y) ** 2)

        for m_init in m_candidates:
            w1_init, b1_init = _segment_line(x_flat < m_init, yt, mean_y)
            w2_init, b2_init = _segment_line(x_flat >= m_init, yt, mean_y)

            # Start with a moderately sharp transition (s = 5).
            p0 = np.array([w1_init, b1_init, w2_init, b2_init, m_init, 5.0])

            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds)
            except Exception:
                # A failed start is skipped; the remaining starts and the
                # constant fallback still apply.
                continue

            if res.fun < best_loss:
                best_loss = res.fun
                best_p = res.x

        params_opt_list.append(best_p)

    params_opt = np.array(params_opt_list)

    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#3 Run 3 R² = 0.931286

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Implements a smooth transition (sigmoid-weighted) between two linear regimes in log-flops space.
This can model monotonic, U-shaped, and inverted U-shaped scaling laws (double descent).
Uses 6 parameters: slope1, bias1, slope2, bias2, transition_point, transition_sharpness.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-blended two-line model.

    The prediction is ``(1 - sig) * (w1*x + b1) + sig * (w2*x + b2)`` with
    ``sig = sigmoid(s * (x - m))``, where ``x`` is column 0 of the input.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is read. Note that a 1-D input therefore becomes a single row.
        params: ``[w1, b1, w2, b2, m, s]`` as a 1-D array, or a 2-D array with
            one such row per parameter set.

    Returns:
        A 1-D array of predictions when there is a single parameter set,
        otherwise a 2-D array with one column per parameter set.
    """
    features = np.atleast_2d(np.asarray(data_points))
    log_flops = features[:, 0:1]  # keep as a column so it broadcasts

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # One (1, T) row per parameter; (N, 1) op (1, T) broadcasts to (N, T).
    rows = theta.T
    slope_lo, bias_lo, slope_hi, bias_hi, midpoint, sharpness = (
        rows[k:k + 1, :] for k in range(6)
    )

    # Clipping the logit keeps np.exp from overflowing.
    logit = np.clip(sharpness * (log_flops - midpoint), -50, 50)
    gate = 1.0 / (1.0 + np.exp(-logit))

    line_lo = slope_lo * log_flops + bias_lo
    line_hi = slope_hi * log_flops + bias_hi

    blended = (1.0 - gate) * line_lo + gate * line_hi

    if blended.shape[1] == 1:
        return blended[:, 0]
    return blended


def fit_scaling_law(data_points, loss_values):
    """Fit the six sigmoid-blend parameters to each loss column.

    For every target column, L-BFGS-B is started from several split points
    (the 20th/40th/60th/80th percentiles of the input column). Each start
    seeds the two linear regimes with least-squares lines through the points
    left and right of the split. The start with the lowest mean squared error
    wins; if none beats a constant prediction at the column mean, that
    constant model is returned instead.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is used as the regressor.
        loss_values: Array-like of shape (N,) or (N, T); columns are fitted
            independently.

    Returns:
        ``[w1, b1, w2, b2, m, s]`` as a (6,) array for a single target, or a
        (T, 6) array for T targets.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    _, T = y_2d.shape

    # Candidate midpoints. Duplicates (e.g. heavily tied inputs) would only
    # repeat an identical, deterministic optimisation, so drop them.
    m_candidates = np.unique(np.percentile(x_flat, [20, 40, 60, 80]))

    # Loop-invariant bounds: slopes/biases free, midpoint inside the data
    # range, sharpness positive and capped.
    bounds = [
        (None, None), (None, None),  # w1, b1
        (None, None), (None, None),  # w2, b2
        (np.min(x_flat), np.max(x_flat)),  # m
        (0.1, 100.0),  # s
    ]

    def _segment_line(mask, yt, flat_level):
        # Least-squares line through the masked points, or a flat line at
        # `flat_level` when there are too few points or the fit fails.
        if np.sum(mask) < 2:
            return 0.0, flat_level
        xs, ys = x_flat[mask], yt[mask]
        try:
            slope, intercept = np.polyfit(xs, ys, 1)
        except Exception:
            # Best-effort initialisation only; Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            return 0.0, flat_level
        return slope, intercept

    params_opt_list = []

    for t in range(T):
        yt = y_2d[:, t]

        def objective(p):
            pred = scaling_law_func(X, p)
            return np.mean((pred - yt) ** 2)

        # Default fallback: constant prediction at the mean.
        mean_y = np.mean(yt)
        best_p = np.array([0.0, mean_y, 0.0, mean_y, np.mean(x_flat), 1.0])
        best_loss = np.mean((yt - mean_y) ** 2)

        for m_init in m_candidates:
            w1_init, b1_init = _segment_line(x_flat < m_init, yt, mean_y)
            w2_init, b2_init = _segment_line(x_flat >= m_init, yt, mean_y)

            # Start with a moderately sharp transition (s = 5).
            p0 = np.array([w1_init, b1_init, w2_init, b2_init, m_init, 5.0])

            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds)
            except Exception:
                # A failed start is skipped; the remaining starts and the
                # constant fallback still apply.
                continue

            if res.fun < best_loss:
                best_loss = res.fun
                best_p = res.x

        params_opt_list.append(best_p)

    params_opt = np.array(params_opt_list)

    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#4 Run 4 R² = 0.931286

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios
Implements a smooth transition (sigmoid-weighted) between two linear regimes in log-flops space.
This can model monotonic, U-shaped, and inverted U-shaped scaling laws (double descent).
Uses 6 parameters: slope1, bias1, slope2, bias2, transition_point, transition_sharpness.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-blended two-line model.

    The prediction is ``(1 - sig) * (w1*x + b1) + sig * (w2*x + b2)`` with
    ``sig = sigmoid(s * (x - m))``, where ``x`` is column 0 of the input.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is read. Note that a 1-D input therefore becomes a single row.
        params: ``[w1, b1, w2, b2, m, s]`` as a 1-D array, or a 2-D array with
            one such row per parameter set.

    Returns:
        A 1-D array of predictions when there is a single parameter set,
        otherwise a 2-D array with one column per parameter set.
    """
    features = np.atleast_2d(np.asarray(data_points))
    log_flops = features[:, 0:1]  # keep as a column so it broadcasts

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # One (1, T) row per parameter; (N, 1) op (1, T) broadcasts to (N, T).
    rows = theta.T
    slope_lo, bias_lo, slope_hi, bias_hi, midpoint, sharpness = (
        rows[k:k + 1, :] for k in range(6)
    )

    # Clipping the logit keeps np.exp from overflowing.
    logit = np.clip(sharpness * (log_flops - midpoint), -50, 50)
    gate = 1.0 / (1.0 + np.exp(-logit))

    line_lo = slope_lo * log_flops + bias_lo
    line_hi = slope_hi * log_flops + bias_hi

    blended = (1.0 - gate) * line_lo + gate * line_hi

    if blended.shape[1] == 1:
        return blended[:, 0]
    return blended


def fit_scaling_law(data_points, loss_values):
    """Fit the six sigmoid-blend parameters to each loss column.

    For every target column, L-BFGS-B is started from several split points
    (the 20th/40th/60th/80th percentiles of the input column). Each start
    seeds the two linear regimes with least-squares lines through the points
    left and right of the split. The start with the lowest mean squared error
    wins; if none beats a constant prediction at the column mean, that
    constant model is returned instead.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is used as the regressor.
        loss_values: Array-like of shape (N,) or (N, T); columns are fitted
            independently.

    Returns:
        ``[w1, b1, w2, b2, m, s]`` as a (6,) array for a single target, or a
        (T, 6) array for T targets.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    _, T = y_2d.shape

    # Candidate midpoints. Duplicates (e.g. heavily tied inputs) would only
    # repeat an identical, deterministic optimisation, so drop them.
    m_candidates = np.unique(np.percentile(x_flat, [20, 40, 60, 80]))

    # Loop-invariant bounds: slopes/biases free, midpoint inside the data
    # range, sharpness positive and capped.
    bounds = [
        (None, None), (None, None),  # w1, b1
        (None, None), (None, None),  # w2, b2
        (np.min(x_flat), np.max(x_flat)),  # m
        (0.1, 100.0),  # s
    ]

    def _segment_line(mask, yt, flat_level):
        # Least-squares line through the masked points, or a flat line at
        # `flat_level` when there are too few points or the fit fails.
        if np.sum(mask) < 2:
            return 0.0, flat_level
        xs, ys = x_flat[mask], yt[mask]
        try:
            slope, intercept = np.polyfit(xs, ys, 1)
        except Exception:
            # Best-effort initialisation only; Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            return 0.0, flat_level
        return slope, intercept

    params_opt_list = []

    for t in range(T):
        yt = y_2d[:, t]

        def objective(p):
            pred = scaling_law_func(X, p)
            return np.mean((pred - yt) ** 2)

        # Default fallback: constant prediction at the mean.
        mean_y = np.mean(yt)
        best_p = np.array([0.0, mean_y, 0.0, mean_y, np.mean(x_flat), 1.0])
        best_loss = np.mean((yt - mean_y) ** 2)

        for m_init in m_candidates:
            w1_init, b1_init = _segment_line(x_flat < m_init, yt, mean_y)
            w2_init, b2_init = _segment_line(x_flat >= m_init, yt, mean_y)

            # Start with a moderately sharp transition (s = 5).
            p0 = np.array([w1_init, b1_init, w2_init, b2_init, m_init, 5.0])

            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bounds)
            except Exception:
                # A failed start is skipped; the remaining starts and the
                # constant fallback still apply.
                continue

            if res.fun < best_loss:
                best_loss = res.fun
                best_p = res.x

        params_opt_list.append(best_p)

    params_opt = np.array(params_opt_list)

    return params_opt[0] if T == 1 else params_opt
# EVOLVE-BLOCK-END

#5 Run 5 R² = 0.930479

▼

Python

# EVOLVE-BLOCK-START
"""
Scaling law discovery for LLM finetuning scenarios.
Models the scaling behavior as a smooth transition between two linear regimes in log-flops space.
This functional form (Sigmoid-weighted Broken Stick) captures monotonic, U-shaped, and inverted U-shaped patterns.
Uses 6 parameters: [slope1, bias1, slope2, bias2, transition_point, sharpness].
Improved fitting via input centering and quadratic initialization to robustly detect U-shapes.
"""
import numpy as np
from scipy.optimize import minimize

def scaling_law_func(data_points, params):
    """Evaluate the sigmoid-weighted blend of two straight lines.

    The prediction is ``(1 - sig) * (w1*x + b1) + sig * (w2*x + b2)`` with
    ``sig = sigmoid(s * (x - m))``, where ``x`` is column 0 of the input.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is read. Note that a 1-D input therefore becomes a single row.
        params: ``[w1, b1, w2, b2, m, s]`` as a 1-D array, or a 2-D array with
            one such row per parameter set.

    Returns:
        A 1-D array of predictions when there is a single parameter set,
        otherwise a 2-D array with one column per parameter set.
    """
    inputs = np.atleast_2d(np.asarray(data_points))
    x_col = inputs[:, 0:1]  # column vector so it broadcasts over param sets

    theta = np.asarray(params)
    if theta.ndim == 1:
        theta = theta[None, :]

    # Row k of the transpose holds parameter k for every parameter set.
    by_param = theta.T
    w_left, b_left, w_right, b_right, center, steepness = (
        by_param[k:k + 1, :] for k in range(6)
    )

    # Bound the logit so np.exp cannot overflow or underflow.
    logit = np.clip(steepness * (x_col - center), -50.0, 50.0)
    weight = 1.0 / (1.0 + np.exp(-logit))

    left_line = w_left * x_col + b_left
    right_line = w_right * x_col + b_right

    mixed = (1.0 - weight) * left_line + weight * right_line

    if mixed.shape[1] == 1:
        return mixed[:, 0]
    return mixed

def fit_scaling_law(data_points, loss_values):
    """Fit the six sigmoid-blend parameters to each loss column.

    The input column is mean-centred before optimisation and the fitted
    parameters are mapped back to the original coordinates afterwards. For
    each target column, L-BFGS-B is started from up to four initial guesses
    (a quadratic-fit guess, splits at the 33rd and 66th percentiles, and a
    single straight line); the result with the lowest mean squared error is
    kept. If every start fails, a constant model at the column mean is used.

    Args:
        data_points: Array-like promoted with ``np.atleast_2d``; only column 0
            is used as the regressor.
        loss_values: Array-like of shape (N,) or (N, T); columns are fitted
            independently.

    Returns:
        ``[w1, b1, w2, b2, m, s]`` in original (uncentred) coordinates, as a
        (6,) array for a single target or a (T, 6) array for T targets.
    """
    X = np.atleast_2d(np.asarray(data_points))
    x_flat = X[:, 0]
    y = np.asarray(loss_values)
    y_2d = y[:, None] if y.ndim == 1 else y
    _, T = y_2d.shape

    # Centre the input for better conditioning (decouples slope and bias).
    x_mean = np.mean(x_flat)
    x_centered = x_flat - x_mean
    x_min, x_max = np.min(x_centered), np.max(x_centered)

    # Loop-invariant bounds: midpoint slightly beyond the data range,
    # sharpness positive and capped.
    bnds = [
        (None, None), (None, None),
        (None, None), (None, None),
        (x_min - 0.5, x_max + 0.5),
        (0.1, 100.0),
    ]

    def _try_polyfit(xs, ys, deg):
        # Initial guesses are best-effort: a failed fit yields None so the
        # caller can fall back. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        try:
            return np.polyfit(xs, ys, deg)
        except Exception:
            return None

    def _line_or_flat(mask, yt, flat_level):
        # Line through the masked points, or a flat line at `flat_level`
        # when there are fewer than two points or the fit fails.
        if np.sum(mask) >= 2:
            line = _try_polyfit(x_centered[mask], yt[mask], 1)
            if line is not None:
                return line[0], line[1]
        return 0.0, flat_level

    best_params_list = []

    for t in range(T):
        yt = y_2d[:, t]
        mean_y = np.mean(yt)

        def objective(p):
            # p: [w1, b1, w2, b2, m, s] in centred coordinates.
            z = np.clip(p[5] * (x_centered - p[4]), -50.0, 50.0)
            sig = 1.0 / (1.0 + np.exp(-z))
            y1 = p[0] * x_centered + p[1]
            y2 = p[2] * x_centered + p[3]
            pred = (1.0 - sig) * y1 + sig * y2
            return np.mean((pred - yt) ** 2)

        candidates = []

        # 1. Quadratic initialisation (suits U / inverted-U shapes).
        quad = _try_polyfit(x_centered, yt, 2)
        if quad is not None:
            c2, c1, c0 = quad
            # Vertex of the parabola, unless it is essentially a line.
            if abs(c2) > 1e-5:
                m_quad = np.clip(-c1 / (2 * c2), x_min, x_max)
            else:
                m_quad = 0.0
            # Tangent lines of the parabola at both ends of the data range.
            w1_q = 2 * c2 * x_min + c1
            w2_q = 2 * c2 * x_max + c1
            b1_q = (c2 * x_min**2 + c1 * x_min + c0) - w1_q * x_min
            b2_q = (c2 * x_max**2 + c1 * x_max + c0) - w2_q * x_max
            candidates.append([w1_q, b1_q, w2_q, b2_q, m_quad, 5.0])

        # 2. Split initialisation (suits V shapes / broken sticks).
        for pct in (33, 66):
            split_x = np.percentile(x_centered, pct)
            wl, bl = _line_or_flat(x_centered <= split_x, yt, mean_y)
            wr, br = _line_or_flat(x_centered > split_x, yt, mean_y)
            candidates.append([wl, bl, wr, br, split_x, 5.0])

        # 3. Single straight line used for both regimes.
        line = _try_polyfit(x_centered, yt, 1)
        if line is not None:
            wl, bl = line
            candidates.append([wl, bl, wl, bl, 0.0, 1.0])

        best_loss = np.inf
        # Used only if no start produces a finite, comparable loss.
        best_p_centered = np.array([0., mean_y, 0., mean_y, 0., 1.])

        for p0 in candidates:
            try:
                res = minimize(objective, p0, method='L-BFGS-B', bounds=bnds, tol=1e-6)
            except Exception:
                # Skip a failed start; other starts and the fallback remain.
                continue
            if res.fun < best_loss:
                best_loss = res.fun
                best_p_centered = res.x

        # Map back to original x:
        #   w * (x - x_mean) + b  ==  w * x + (b - w * x_mean)
        #   s * (x - x_mean - m)  ==  s * (x - (m + x_mean))
        w1_c, b1_c, w2_c, b2_c, m_c, s_c = best_p_centered
        best_params_list.append([
            w1_c,
            b1_c - w1_c * x_mean,
            w2_c,
            b2_c - w2_c * x_mean,
            m_c + x_mean,
            s_c,
        ])

    fitted = np.array(best_params_list)
    return fitted[0] if T == 1 else fitted
# EVOLVE-BLOCK-END