Training a deep neural network is essentially a compression task. We want to represent our training data distribution as a function parameterized by a bunch of matrices. The more complex the distribution, the more parameters we need. The rationale for approximating the entire distribution is that we can then forward any valid point at inference using the same model, with the same weights. But what if our model were trained on-the-fly, at inference? Then, when forwarding a point $x_0$, we would only need to model the local distribution around $x_0$. Since the local region should have lower dimensionality than the entire training set, a much simpler model will suffice!
This is the idea behind local approximation or local regression. Let’s consider a simple regression task.
Task
We are given $n = 100$ samples of the following data:

$$Y = \sin(4X) + \epsilon$$

where $X \sim \mathrm{Uniform}(0, 1)$ and $\epsilon \sim \mathcal{N}\!\left(0, \left(\tfrac{1}{3}\right)^2\right)$.
Plotting code
import numpy as np
import plotly.graph_objects as go
# Generate data
np.random.seed(42)
n_points = 100
X = np.random.uniform(0, 1, n_points)
epsilon = np.random.normal(0, 1 / 3, n_points)
Y = np.sin(4 * X) + epsilon
# True function
x_true = np.linspace(0, 1, 500)
y_true = np.sin(4 * x_true)
# Create the plot
fig = go.Figure()
# Add scatter points for noisy data
fig.add_trace(
go.Scatter(
x=X,
y=Y,
mode="markers",
name="Noisy Data",
marker=dict(color="gray"),
)
)
# Add true function
fig.add_trace(
go.Scatter(
x=x_true,
y=y_true,
mode="lines",
name="True Function",
line=dict(color="red"),
)
)
# Update layout
fig.update_layout(
title="Data",
xaxis_title="X",
yaxis_title="Y",
template="plotly_dark",
height=400,
width=730,
)
# Save the plot to an HTML file
filename = "local_approximation_data.html"
fig.write_html(filename)
print(f"Saved plot to {filename}")
# Show the plot
fig.show()
We denote by $\mathcal{D}$ the dataset, which consists of the samples $\{(x_i, y_i)\}_{i=1}^{n}$.
Our task is to fit a reasonable curve through the data that approximately matches the true function. Let’s denote the curve $\hat{f}(x)$.
$k$-Nearest Neighbors
Given some input $x_0$, one approach is to take the $k$ closest values to $x_0$ and average their $y$-values as the estimate. That is,

$$\hat{f}(x_0) = \frac{1}{k} \sum_{x_i \in N_k(x_0)} y_i$$

where $N_k(x_0)$ denotes the set of the $k$ nearest points to $x_0$.
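As a minimal sketch of this estimator at a single query point (the helper name `knn_estimate` is mine, not part of the plotting code below, which evaluates the same idea across a grid):

```python
import numpy as np

def knn_estimate(x0, X, Y, k):
    """Average the y-values of the k training points closest to x0."""
    nearest = np.argsort(np.abs(X - x0))[:k]  # indices of the k nearest neighbors
    return Y[nearest].mean()
```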
Plotting code
import plotly.graph_objects as go
import numpy as np
# Generate data
np.random.seed(42)
n_points = 100
X = np.random.uniform(0, 1, n_points)
epsilon = np.random.normal(0, 1 / 3, n_points)
Y = np.sin(4 * X) + epsilon
# True function
x_true = np.linspace(0, 1, 500)
y_true = np.sin(4 * x_true)
# k-NN for a range of k
x_curve = np.arange(0, 1, 0.01)
k_range = range(1, 21)
y_curves_knn = {}
for k in k_range:
y_curve = []
for x in x_curve:
distances = np.square(X - x)
nearest_indices = np.argsort(distances)[:k]
y_curve.append(np.mean(Y[nearest_indices]))
y_curves_knn[k] = y_curve
# Create the Plotly figure
fig = go.Figure()
# Add static traces
fig.add_trace(
go.Scatter(x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray"))
)
fig.add_trace(
go.Scatter(
x=x_true, y=y_true, mode="lines", name="True Function", line=dict(color="red")
)
)
# Add the first k-NN curve (k=13, the default slider position)
initial_k = 13
fig.add_trace(
go.Scatter(
x=x_curve,
y=y_curves_knn[initial_k],
mode="lines",
name="k-NN Curve",
line=dict(color="yellow"),
)
)
# Define slider steps
steps = []
for k in k_range:
step = dict(
method="update",
args=[
{"y": [Y, y_true, y_curves_knn[k]]}, # Update y-data for the traces
{
"title": f"Interactive k-NN Curve with Slider for k = {k}"
}, # Update the title dynamically
],
label=f"{k}",
)
steps.append(step)
# Add slider to the layout
sliders = [
dict(
active=initial_k - 1,
currentvalue={"prefix": "k = "},
pad={"t": 50},
steps=steps,
)
]
fig.update_layout(
sliders=sliders,
title=f"Interactive k-NN Curve with Slider for k = {initial_k}",
xaxis_title="X",
yaxis_title="Y",
template="plotly_dark",
height=400,
width=730,
)
# Show and save the plot
fig.show()
html_path = "./knn_slider.html"
fig.write_html(html_path)
print(f"Saved interactive plot to {html_path}")
You can see by using the slider that a larger $k$ results in a smoother curve, but low-$k$ curves incorporate some noise. At the extremes, $k = 1$ traces the training data exactly, while very large $k$ approaches a flat global average.
Nadaraya–Watson Kernel Regression
Instead of limiting your subset of data to $k$ points, you could instead consider all points in the set, but weight each point’s contribution based on its closeness to $x_0$. Consider the model

$$\hat{f}(x_0) = \frac{\sum_{i=1}^{n} K_\lambda(x_0, x_i)\, y_i}{\sum_{i=1}^{n} K_\lambda(x_0, x_i)}$$

where $K_\lambda$ is a kernel, which we will use as a closeness metric.

This function is parameterized by $\lambda$, known as the bandwidth, which controls what range of values in the data play a role in the output of $\hat{f}(x_0)$. This becomes clear if we plot these functions.
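As a rough sketch of the estimator itself (the function name is mine; the interactive plotting code further below implements the same weighted average):

```python
import numpy as np

def nadaraya_watson(x0, X, Y, kernel, bandwidth):
    """Kernel-weighted average of all training targets, centered at x0."""
    weights = kernel(np.abs(X - x0) / bandwidth)  # w_i = K_lambda(x0, x_i)
    return np.sum(weights * Y) / np.sum(weights)  # assumes at least one nonzero weight
```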
Kernel Functions
What is plotted below is

$$K_\lambda(u) = \frac{1}{Z_\lambda}\, D\!\left(\frac{u}{\lambda}\right)$$

where the normalization constant $Z_\lambda$ keeps $K_\lambda$ integrating to $1$ over its support.
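For reference, the three shapes $D(u)$ used in the code below are the Epanechnikov, tricube, and Gaussian kernels:

$$
D(u) =
\begin{cases}
\tfrac{3}{4}\,(1 - u^2) & \text{Epanechnikov},\ |u| \le 1 \\
(1 - |u|^3)^3 & \text{tricube},\ |u| \le 1 \\
\tfrac{1}{\sqrt{2\pi}}\, e^{-u^2/2} & \text{Gaussian}
\end{cases}
$$

with the compact-support kernels set to $0$ for $|u| > 1$.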
Plotting code
import numpy as np
import plotly.graph_objects as go
from scipy.integrate import quad
# Define kernel functions
def epanechnikov_kernel(u):
return np.maximum(0, 0.75 * (1 - u**2))
def tricube_kernel(u):
return np.maximum(0, (1 - np.abs(u) ** 3) ** 3)
def gaussian_kernel(u):
return np.exp(-0.5 * u**2) / np.sqrt(2 * np.pi)
def renormalized_kernel(kernel_func, u_range, bandwidth):
def kernel_with_lambda(u):
scaled_u = u / bandwidth
normalization_factor, _ = quad(lambda v: kernel_func(v / bandwidth), *u_range)
return kernel_func(scaled_u) / normalization_factor
return kernel_with_lambda
# Kernel function plot generator
def generate_kernel_plot(
kernel_name, kernel_func, x_range, u_range, lambda_values, y_range
):
fig = go.Figure()
# Initial lambda
initial_lambda = lambda_values[len(lambda_values) // 2]
# Generate initial kernel curve
x = np.linspace(*x_range, 500)
kernel_with_lambda = renormalized_kernel(kernel_func, u_range, initial_lambda)
y = kernel_with_lambda(x)
fig.add_trace(
go.Scatter(
x=x,
y=y,
mode="lines",
name=f"{kernel_name} Kernel (λ={initial_lambda:.2f})",
line=dict(color="green"),
)
)
# Create frames for the slider
frames = []
for bandwidth in lambda_values:
kernel_with_lambda = renormalized_kernel(kernel_func, u_range, bandwidth)
y = kernel_with_lambda(x)
frames.append(
go.Frame(
data=[
go.Scatter(
x=x,
y=y,
mode="lines",
name=f"{kernel_name} Kernel (λ={bandwidth:.2f})",
line=dict(color="green"),
)
],
name=f"{bandwidth:.2f}",
)
)
# Add frames to the figure
fig.frames = frames
# Add slider
sliders = [
{
"active": len(lambda_values) // 2,
"currentvalue": {"prefix": "Bandwidth λ: "},
"steps": [
{
"args": [
[f"{bandwidth:.2f}"],
{"frame": {"duration": 0, "redraw": True}, "mode": "immediate"},
],
"label": f"{bandwidth:.2f}",
"method": "animate",
}
for bandwidth in lambda_values
],
}
]
# Update layout
fig.update_layout(
title=f"{kernel_name} Kernel Function",
xaxis_title="u",
yaxis_title="K(u)",
yaxis_range=y_range,
template="plotly_dark",
sliders=sliders,
height=400, # Adjusted to match previous size
width=730, # Adjusted to match previous size
updatemenus=[
{
"direction": "left",
"pad": {"r": 10, "t": 87},
"showactive": False,
"type": "buttons",
"x": 0.1,
"xanchor": "right",
"y": 0,
"yanchor": "top",
}
],
)
return fig
# Kernel functions
kernels = {
"Epanechnikov": epanechnikov_kernel,
"Tricube": tricube_kernel,
"Gaussian": gaussian_kernel,
}
# Parameters
x_range_plot = (-3, 3) # Range of u values for the plot
u_range_integration = (-3, 3) # Range for normalization
lambda_values = np.linspace(0.01, 2, 20) # Linear lambda values from 0.01 to 2
y_range_plot = (0, 1.5) # Adjusted range to fit the normalized functions
# Generate and display plots for each kernel
for kernel_name, kernel_func in kernels.items():
fig = generate_kernel_plot(
kernel_name,
kernel_func,
x_range_plot,
u_range_integration,
lambda_values,
y_range_plot,
)
# Save the figure to an HTML file
filename = f"{kernel_name}_dynamic_normalization_kernel_function.html"
fig.write_html(filename, auto_play=False)
print(f"Saved {kernel_name} kernel plot to {filename}")
# Show the figure
fig.show()
Results
We now plot the regression results for each of the kernel functions. Each plot has a slider which controls the bandwidth $\lambda$ live.
Plotting code
import numpy as np
import plotly.graph_objects as go
# Define kernel functions
def epanechnikov_kernel(u):
return np.maximum(0, 0.75 * (1 - u**2))
def tricube_kernel(u):
return np.maximum(0, (1 - np.abs(u) ** 3) ** 3)
def gaussian_kernel(u):
return np.exp(-0.5 * u**2) / np.sqrt(2 * np.pi)
# Kernel regression function
def kernel_regression(X, Y, x_curve, kernel_func, bandwidth):
y_curve = []
for x in x_curve:
distances = np.abs(X - x) / bandwidth
weights = kernel_func(distances)
weighted_average = (
np.sum(weights * Y) / np.sum(weights) if np.sum(weights) > 0 else 0
)
y_curve.append(weighted_average)
return y_curve
# Generate data
np.random.seed(42)
n_points = 100
X = np.random.uniform(0, 1, n_points)
epsilon = np.random.normal(0, 1 / 3, n_points)
Y = np.sin(4 * X) + epsilon
# True curve
x_true = np.linspace(0, 1, 500)
y_true = np.sin(4 * x_true)
# Points for kernel estimation
x_curve = x_true
# Kernel functions
kernels = {
"Epanechnikov": epanechnikov_kernel,
"Tricube": tricube_kernel,
"Gaussian": gaussian_kernel,
}
# Range of bandwidths for the slider in logspace
lambda_values = np.logspace(-2, 0, 20) # From 0.01 to 1
# Generate separate plots for each kernel
for kernel_name, kernel_func in kernels.items():
fig = go.Figure()
# Add scatter points for noisy data
fig.add_trace(
go.Scatter(
x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")
)
)
# Add true function
fig.add_trace(
go.Scatter(
x=x_true,
y=y_true,
mode="lines",
name="True Function",
line=dict(color="red"),
)
)
# Add initial kernel curve
initial_bandwidth = lambda_values[0]
y_curve = kernel_regression(X, Y, x_curve, kernel_func, initial_bandwidth)
fig.add_trace(
go.Scatter(
x=x_curve,
y=y_curve,
mode="lines",
name=f"Nadaraya-Watson ({kernel_name})",
line=dict(color="green"),
)
)
# Create frames for the slider
frames = []
for bandwidth in lambda_values:
y_curve = kernel_regression(X, Y, x_curve, kernel_func, bandwidth)
frames.append(
go.Frame(
data=[
go.Scatter(
x=X,
y=Y,
mode="markers",
name="Noisy Data",
marker=dict(color="gray"),
),
go.Scatter(
x=x_true,
y=y_true,
mode="lines",
name="True Function",
line=dict(color="red"),
),
go.Scatter(
x=x_curve,
y=y_curve,
mode="lines",
name=f"Nadaraya-Watson ({kernel_name})",
line=dict(color="green"),
),
],
name=f"{bandwidth:.2f}",
)
)
# Add frames to the figure
fig.frames = frames
# Add slider
sliders = [
{
"active": 0,
"currentvalue": {"prefix": "Bandwidth λ: "},
"steps": [
{
"args": [
[f"{bandwidth:.2f}"],
{"frame": {"duration": 0, "redraw": True}, "mode": "immediate"},
],
"label": f"{bandwidth:.2f}",
"method": "animate",
}
for bandwidth in lambda_values
],
}
]
# Update layout
fig.update_layout(
title=f"Nadaraya-Watson Kernel Regression ({kernel_name} Kernel)",
xaxis_title="X",
yaxis_title="Y",
template="plotly_dark",
sliders=sliders,
height=400,
width=730,
updatemenus=[
{
"buttons": [
{
"args": [
None,
{
"frame": {"duration": 500, "redraw": True},
"fromcurrent": True,
},
],
"label": "Play",
"method": "animate",
},
{
"args": [
[None],
{
"frame": {"duration": 0, "redraw": True},
"mode": "immediate",
},
],
"label": "Pause",
"method": "animate",
},
],
"direction": "left",
"pad": {"r": 10, "t": 87},
"showactive": False,
"type": "buttons",
"x": 0.1,
"xanchor": "right",
"y": 0,
"yanchor": "top",
}
],
)
# Save the figure to an HTML file
filename = f"{kernel_name}_kernel_regression.html"
fig.write_html(filename, auto_play=False)
print(f"Saved {kernel_name} kernel plot to {filename}")
# Show the figure
fig.show()
We see that a simple weighted average of the data is able to model a sinusoid quite well.
Local Linear Regression
In Nadaraya-Watson kernel regression, we take a weighted average in a neighborhood defined by the kernel function $K_\lambda$. A potential issue is that we cannot interpolate smoothly within local neighborhoods, since we aren’t actually assuming that the region follows any model.
What if we assume each region is locally linear? Then, we could solve for the least squares fit and interpolate freely!
Region: $k$-NN
Let’s define our local region as the $k$ nearest neighbors of our input $x_0$. Let $X_{(k)}$ be the design matrix with rows $(1, x_i)$ for those neighbors, and $y_{(k)}$ be the corresponding target values. The least squares fit coefficients are

$$\hat{\beta} = \left(X_{(k)}^\top X_{(k)}\right)^{-1} X_{(k)}^\top y_{(k)}$$

and the estimate at $x_0$ is $\hat{f}(x_0) = \hat{\beta}_0 + \hat{\beta}_1 x_0$.
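A minimal sketch of this fit at a single query point (the helper and variable names are mine; the plotting code below applies the same fit across a grid of query points):

```python
import numpy as np

def knn_local_linear(x0, X, Y, k):
    """Ordinary least squares line through the k nearest neighbors of x0."""
    nearest = np.argsort(np.abs(X - x0))[:k]
    X_knn, Y_knn = X[nearest], Y[nearest]
    A = np.vstack((np.ones_like(X_knn), X_knn)).T  # design matrix with rows (1, x_i)
    beta = np.linalg.pinv(A.T @ A) @ A.T @ Y_knn   # least squares coefficients
    return beta[0] + beta[1] * x0                  # evaluate the local line at x0
```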
Plotting code
import plotly.graph_objects as go
import numpy as np
# Generate data
np.random.seed(42)
n_points = 100
X = np.random.uniform(0, 1, n_points)
epsilon = np.random.normal(0, 1 / 3, n_points)
Y = np.sin(4 * X) + epsilon
# True function
x_true = np.linspace(0, 1, 500)
y_true = np.sin(4 * x_true)
# k-NN Local Linear Regression
def knn_linear_regression(X, Y, x_curve, k_range):
y_curves = {}
for k in k_range:
y_curve = []
for x in x_curve:
# Find k nearest neighbors
distances = np.abs(X - x)
nearest_indices = np.argsort(distances)[:k]
# Select k nearest neighbors
X_knn = X[nearest_indices]
Y_knn = Y[nearest_indices]
# Create design matrix for k-nearest neighbors
X_design = np.vstack((np.ones_like(X_knn), X_knn)).T
# Solve for beta using ordinary least squares
beta = np.linalg.pinv(X_design.T @ X_design) @ X_design.T @ Y_knn
# Predict y-value
y_curve.append(beta[0] + beta[1] * x)
y_curves[k] = y_curve
return y_curves
# Common variables
x_curve = np.arange(0, 1, 0.01)
k_range = range(1, 21) # Values of k from 1 to 20
initial_k = 10 # Default value of k
# Compute LLR using k-NN
y_curves_knn = knn_linear_regression(X, Y, x_curve, k_range)
# Create the Plotly figure
fig = go.Figure()
# Add static traces
fig.add_trace(
go.Scatter(x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray"))
)
fig.add_trace(
go.Scatter(
x=x_true, y=y_true, mode="lines", name="True Function", line=dict(color="red")
)
)
# Add the first k-NN curve (k=initial_k)
fig.add_trace(
go.Scatter(
x=x_curve,
y=y_curves_knn[initial_k],
mode="lines",
name="k-NN Curve",
line=dict(color="yellow"),
)
)
# Define slider steps
steps = []
for k in k_range:
step = dict(
method="update",
args=[
{"y": [Y, y_true, y_curves_knn[k]]}, # Update y-data for the traces
{
"title": f"k-NN Local Linear Regression Curve with k = {k}"
}, # Update the title dynamically
],
label=f"{k}",
)
steps.append(step)
# Add slider to the layout
sliders = [
dict(
active=k_range.index(initial_k), # Use the index of initial_k
currentvalue={"prefix": "k = "},
pad={"t": 50},
steps=steps,
)
]
fig.update_layout(
sliders=sliders,
title=f"k-NN Local Linear Regression Curve with k = {initial_k}",
xaxis_title="X",
yaxis_title="Y",
template="plotly_dark",
height=400,
width=730,
)
# Show and save the plot
fig.show()
html_path = "./knn_slider_llr.html"
fig.write_html(html_path)
print(f"Saved interactive k-NN plot to {html_path}")
We see that the output can be quite rough for small $k$.
Region: Kernel Function
Perhaps we can reuse some ideas from the Nadaraya-Watson kernel. We would like to consider all points in the training set to varying degrees, with higher weights inside the local region and lower weights outside.
For this, we can use a weighted least squares objective, with weights $w_i = K_\lambda(x_0, x_i)$. This has the solution

$$\hat{\beta} = \left(X^\top W X\right)^{-1} X^\top W y$$

where $W = \operatorname{diag}(w_1, \dots, w_n)$, $X$ is the design matrix with rows $(1, x_i)$ over all $n$ points, and the estimate is again $\hat{f}(x_0) = \hat{\beta}_0 + \hat{\beta}_1 x_0$.
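As a sketch of the weighted fit at one query point (again, the names are mine, but it mirrors the plotting code below):

```python
import numpy as np

def kernel_local_linear(x0, X, Y, kernel, bandwidth):
    """Weighted least squares line around x0, with kernel weights on every point."""
    w = kernel((X - x0) / bandwidth)                   # w_i = K_lambda(x0, x_i)
    W = np.diag(w)
    A = np.vstack((np.ones_like(X), X)).T              # design matrix with rows (1, x_i)
    beta = np.linalg.pinv(A.T @ W @ A) @ A.T @ W @ Y   # weighted least squares solution
    return beta[0] + beta[1] * x0
```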
Plotting the results for various kernel functions $K_\lambda$:
Plotting code
import plotly.graph_objects as go
import numpy as np
# Generate data
np.random.seed(42)
n_points = 100
X = np.random.uniform(0, 1, n_points)
epsilon = np.random.normal(0, 1 / 3, n_points)
Y = np.sin(4 * X) + epsilon
# True function
x_true = np.linspace(0, 1, 500)
y_true = np.sin(4 * x_true)
# Kernels
def gaussian_kernel(u):
return np.exp(-0.5 * u**2)
def epanechnikov_kernel(u):
return np.maximum(0, 1 - u**2)
def tricube_kernel(u):
return np.maximum(0, (1 - np.abs(u) ** 3) ** 3)
# Local Linear Regression for a specific kernel
def local_linear_regression(X, Y, x_curve, bandwidths, kernel):
y_curves = {}
for λ in bandwidths:
λ_rounded = round(λ, 2)
y_curve = []
for x in x_curve:
# Calculate weights using the specified kernel
distances = (X - x) / λ
weights = kernel(distances)
W = np.diag(weights)
# Create design matrix
X_design = np.vstack((np.ones_like(X), X)).T
# Solve for beta using weighted least squares
beta = np.linalg.pinv(X_design.T @ W @ X_design) @ X_design.T @ W @ Y
# Predict y-value
y_curve.append(beta[0] + beta[1] * x)
y_curves[λ_rounded] = y_curve
return y_curves
# Common variables
x_curve = np.arange(0, 1, 0.01)
bandwidths = np.linspace(0.05, 0.5, 20)
initial_λ = bandwidths[len(bandwidths) // 2]
# Generate plots for each kernel
kernels = {
"Gaussian Kernel": gaussian_kernel,
"Epanechnikov Kernel": epanechnikov_kernel,
"Tricube Kernel": tricube_kernel,
}
plots = []
for kernel_name, kernel_func in kernels.items():
# Compute LLR with the specified kernel
y_curves = local_linear_regression(X, Y, x_curve, bandwidths, kernel_func)
# Create the Plotly figure
fig = go.Figure()
# Add static traces
fig.add_trace(
go.Scatter(
x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")
)
)
fig.add_trace(
go.Scatter(
x=x_true,
y=y_true,
mode="lines",
name="True Function",
line=dict(color="red"),
)
)
# Add the first LLR curve (using the middle value of bandwidths)
fig.add_trace(
go.Scatter(
x=x_curve,
y=y_curves[round(initial_λ, 2)],
mode="lines",
name=f"{kernel_name} Curve",
line=dict(color="yellow"),
)
)
# Define slider steps
steps = []
for λ in bandwidths:
λ_rounded = round(λ, 2)
step = dict(
method="update",
args=[
{"y": [Y, y_true, y_curves[λ_rounded]]}, # Update y-data for the traces
{
"title": f"LLR: {kernel_name} with Bandwidth λ = {λ_rounded}"
}, # Update the title dynamically
],
label=f"{λ_rounded}",
)
steps.append(step)
# Add slider to the layout
sliders = [
dict(
active=len(bandwidths) // 2, # Use the index of the middle bandwidth
currentvalue={"prefix": "λ = "},
pad={"t": 50},
steps=steps,
)
]
fig.update_layout(
sliders=sliders,
title=f"LLR: {kernel_name} with Bandwidth λ = {round(initial_λ, 2)}",
xaxis_title="X",
yaxis_title="Y",
template="plotly_dark",
height=400,
width=730,
)
plots.append(fig)
# Show and save the plots
for i, (kernel_name, fig) in enumerate(zip(kernels.keys(), plots)):
fig.show()
html_path = f"./llr_{kernel_name.lower().replace(' ', '_')}.html"
fig.write_html(html_path)
print(f"Saved interactive plot for {kernel_name} to {html_path}")
I think the results look a lot smoother!
References
- *The Elements of Statistical Learning*, Hastie, Tibshirani, and Friedman (2009). A comprehensive guide to data mining, inference, and prediction.