Linear regression using scikit-learn

Dr. Huidae Cho

1   Univariate linear regression

from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Sample data for a univariate fit: one predictor and one response,
# each stored as an (n, 1) column vector.
X = np.array([2, 3, 1, 34, 5, 7]).reshape(-1, 1)
y = np.array([10, 20, 32, 35, 50, 60]).reshape(-1, 1)

# Flag outliers in X with the 1.5 * IQR rule: anything outside
# [q1 - 1.5 * iqr, q3 + 1.5 * iqr] is treated as an outlier.
q1, q3 = np.percentile(X, [25, 75])
iqr = q3 - q1
min_x = q1 - 1.5 * iqr
max_x = q3 + 1.5 * iqr

# Row indices of the flagged samples (low-side outliers first, then high-side).
out_idx = np.concatenate((np.nonzero(X < min_x)[0], np.nonzero(X > max_x)[0]))

# Drop the flagged rows from both arrays so X and y stay aligned.
X = np.delete(X, out_idx, axis=0)
y = np.delete(y, out_idx, axis=0)

# Fit ordinary least squares; fit() returns the fitted estimator itself.
lin_regressor = LinearRegression()
lin_reg = lin_regressor.fit(X, y)

# Fitted model is the 2D line y = a * x1 + b.
# Because y is a column vector, a and b come back as small arrays, not scalars.
a = lin_reg.coef_
b = lin_reg.intercept_

# Evaluate the fitted line at two x positions to get the segment to draw.
# NOTE(review): the range still reaches 35, past the x=34 sample that the
# IQR filter above removes -- confirm the wide range is intended.
min_x1, max_x1 = 0, 35
sta_y, end_y = (a * x1 + b for x1 in (min_x1, max_x1))

# Stack the two endpoints into (2, 1) column arrays for plotting.
lin_x1 = np.array([[min_x1], [max_x1]])
lin_y = np.vstack((sta_y, end_y))

# Scatter the filtered samples and overlay the regression line in red.
plt.scatter(X, y)
plt.plot(lin_x1, lin_y, color="red")

2   Bivariate linear regression

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Sample data for a bivariate fit: each row of X is one (x1, x2) sample.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])

# Response generated exactly from y = 1 * x1 + 2 * x2 + 3.
y = X @ np.array([1, 2]) + 3

# Fit ordinary least squares; fit() returns the fitted estimator itself.
lin_regressor = LinearRegression()
lin_reg = lin_regressor.fit(X, y)

# Fitted model is the plane y = a * x1 + b * x2 + c in (x1, x2, y) space.
a, b = lin_reg.coef_
c = lin_reg.intercept_

# Pick two (x1, x2) positions and evaluate the plane there; the segment
# joining them is one line lying on the fitted plane.
min_x1 = min_x2 = 0
max_x1 = max_x2 = 5
sta_y, end_y = (
    a * x1 + b * x2 + c
    for x1, x2 in ((min_x1, min_x2), (max_x1, max_x2))
)

# Per-axis coordinate arrays of the two endpoints.
lin_x1 = np.array((min_x1, max_x1))
lin_x2 = np.array((min_x2, max_x2))
lin_y = np.array((sta_y, end_y))

# 3D plot: samples as points, the evaluated segment in red.
fig = plt.figure()
ax = fig.add_subplot(projection="3d")
ax.set(xlabel="x1", ylabel="x2", zlabel="y")
ax.scatter(*X.T, y)
ax.plot(lin_x1, lin_x2, lin_y, color="red")