import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.linear_model import HuberRegressor, Ridge

import gradio as gr

plt.rcParams['figure.dpi'] = 100
plt.style.use('ggplot')

# Colours for the scatter points, the Ridge line and the Huber line.
C1, C2, C3 = '#ff0000', '#09bd00', '#0000ff'

#=====================================================
def create_plot(outlier_ratio=0.1, epsilon=1.35):
    """Fit Ridge and Huber regressors to noisy linear data with outliers.

    The inliers are 100 points on ``y = 2*x + 2`` with Gaussian noise.
    Outliers are added as two equally sized clusters centred on (11, -30)
    and (-11, 30); each cluster holds ``int(100 * outlier_ratio) // 2``
    points.

    Args:
        outlier_ratio: Number of outliers requested, as a fraction of the
            100 inlier samples (split evenly across the two clusters).
        epsilon: ``epsilon`` passed to ``HuberRegressor``.

    Returns:
        The matplotlib figure showing the data and both fitted lines.
    """
    num_samples = 100

    x = np.linspace(-15, 15, num_samples)
    y = 2 * x + 2 + np.random.normal(loc=0, scale=2.5, size=x.shape[0])

    num_outliers = int(num_samples * outlier_ratio) // 2

    # Append each outlier cluster exactly once. The original appended the
    # second cluster twice, which duplicated its points and made the two
    # clusters unequal in size.
    for center_x, center_y in ((11, -30), (-11, 30)):
        outliers_x = np.random.normal(loc=center_x, scale=1, size=num_outliers)
        outliers_y = np.random.normal(loc=center_y, scale=4, size=num_outliers)
        x = np.concatenate([x, outliers_x])
        y = np.concatenate([y, outliers_y])

    # The estimators expect a 2-D feature matrix: (n_samples, 1).
    X = x[..., None]

    # alpha=0 disables the regularization term of Ridge.
    ridge = Ridge(alpha=0)
    ridge.fit(X, y)

    huber = HuberRegressor(epsilon=epsilon)
    huber.fit(X, y)

    # NOTE(review): figures made with plt.figure() stay registered with
    # pyplot until closed, and one is created per callback - confirm
    # whether it should be closed once Gradio has rendered it.
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.scatter(x, y, c=C1, edgecolor='k', s=40)

    line_x = np.linspace(-15, 15, 10)
    ax.plot(line_x, ridge.coef_ * line_x + ridge.intercept_, c=C2, label='Ridge')
    ax.plot(line_x, huber.coef_ * line_x + huber.intercept_, c=C3, label='Huber')

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.legend()
    ax.set_title('Huber Regressor vs Ridge Regressor with Outliers')

    return fig


# Markdown shown next to the controls in the UI.
info = '''
# Robustness Against Outliers: Huber vs Ridge Regression

This example demonstrates a simple linear regression problem in the existence of outliers, and compares the effectiveness of Huber regression vs Ridge regression.

[Ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html), which is essentially basic L2 linear regression with regularization (but regularization is neglected here), suffers from outliers because the outlying data points are going to heavily increase the loss, forcing the best-fit line to lean towards the outliers to decrease that loss.

[Huber regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html) uses the Huber loss instead of the L2 loss. The Huber loss function behaves quadratically when the error is small and linearly when the error is large. Consequently, the loss resulting from outlying points is weighed less heavily than if we use quadratic loss all over. The epsilon parameter controls the cut-off point between the quadratic and linear regions of the Huber loss.

Use the sliders to increase the outlier ratio and see when the Huber regressor breaks down and how the value of epsilon affects that.

Created by [huabdul](https://huggingface.co/huabdul) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/linear_model/plot_huber_vs_ridge.html#sphx-glr-auto-examples-linear-model-plot-huber-vs-ridge-py).
'''

with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(info)
            s_outlier_ratio = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label='Outlier Ratio')
            s_epsilon = gr.Slider(1, 2, 1.35, step=0.005, label='Epsilon')
        with gr.Column():
            plot = gr.Plot(label='Comparison')

    # Redraw whenever either slider moves, and once when the page loads.
    s_outlier_ratio.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    s_epsilon.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    demo.load(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])

demo.launch()
#=====================================================