ML - 多元线性回归

文章目录

关于多元线性回归
求解
算法封装
- 使用 sklearn 处理 boston 房价回归问题
使用 kNN 解决多元线性回归问题

关于多元线性回归

简单线性回归：假设样本只有一个特征值；
多元线性回归：解决很多特征值。

y^(i)=θ0+θ1X1(i)+θ2X2(i)+...+θnXn(i)\hat{y}^{(i)} = \theta_{0} + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + ... + \theta_{n}X_n^{(i)} y^(i)=θ0+θ1X1(i)+θ2X2(i)+...+θnXn(i)

求解

目标：要找到 $ \theta_{0}, \theta_{1}, \theta_{2} … \theta_{n} $ 使得目标函数（损失函数） ∑i=1m(y(i)−y^(i))2\sum_{i=1}^m ( y^{(i)} - \hat{y}^{(i)} )^2 i=1∑m(y(i)−y^(i))2 尽可能小

将上面多个 $ \theta $ 整理为一个元素，转置为列向量。
θ=(θ0,θ1,θ2,...,θn)T\theta = (\theta_{0}, \theta_{1}, \theta_{2}, ... , \theta_{n} )^T θ=(θ0,θ1,θ2,...,θn)T

给 $ \theta_{0} $ 增加一个虚构的乘数 $ X_0^{(i) } \equiv 1$，得到：
y^(i)=θ0X0(i)+θ1X1(i)+θ2X2(i)+...+θnXn(i)\hat{y}^{(i)} = \theta_{0}X_0^{(i) } + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + ... + \theta_{n}X_n^{(i)} y^(i)=θ0X0(i)+θ1X1(i)+θ2X2(i)+...+θnXn(i)

X(i)=(X0(i),X1(i),X2(i),...,Xn(i))X^{(i)} = ( X_0^{(i)}, X_1^{(i)}, X_2^{(i)}, ... , X_n^{(i)} ) X(i)=(X0(i),X1(i),X2(i),...,Xn(i))

y^(i)=X(i)⋅θ\hat{y}^{(i)} = X^{(i)} \cdot \theta y^(i)=X(i)⋅θ

将此推广到所有样本上

(1X1(1)X2(1)...Xn(1)1X1(2)X2(2)...Xn(2)...1X1(m)X2(m)...Xn(m))\left( \begin{array}{cc} 1 & X_1^{(1)} & X_2^{(1)} & ... & X_n^{(1)} \\ 1 & X_1^{(2)} & X_2^{(2)} & ... & X_n^{(2)} \\ ... \\ 1 & X_1^{(m)} & X_2^{(m)} & ... & X_n^{(m)} \end{array} \right) ⎝⎜⎜⎜⎛11...1X1(1)X1(2)X1(m)X2(1)X2(2)X2(m).........Xn(1)Xn(2)Xn(m)⎠⎟⎟⎟⎞

比之前的 XXX 多了一列 1，为了区分，记作 $ X_b$

θ=(θ0θ1θ2...θn)\theta = \left( \begin{array}{cc} \theta_0 \\ \theta_1 \\ \theta_2 \\ ... \\ \theta_n \end{array} \right) θ=⎝⎜⎜⎜⎜⎛θ0θ1θ2...θn⎠⎟⎟⎟⎟⎞

$ \theta_0 $ 称为截距 intercept，代表一个偏移；
$ \theta_1 - \theta_n $ 称为系数 coefficients；每一个都对应原来样本中的一个特征，用于描述这个特征对于最终样本相应的贡献程度；

y^=Xb⋅θ\hat{y} = X_b \cdot \thetay^=Xb⋅θ

所以目标函数也可以转化为：
(y−Xb⋅θ)T(y−Xb⋅θ)(y - X_b \cdot \theta)^T (y - X_b \cdot \theta) (y−Xb⋅θ)T(y−Xb⋅θ)

θ=(XbTXb)−1XbTy\theta = ( X_b^T X_b )^{-1} X_b^T y θ=(XbTXb)−1XbTy

这个式子称为多元线性回归的 正规方程解（Normal Equation ）

优点：不需要对数据做归一化处理；
因为最后估计的 θ\thetaθ 是原始数据进行数学运算的结果，这种运算中不存在量纲的问题， θ\thetaθ 是线性方程中每一个 x 前面的系数而已，没有量纲的问题。

缺点：时间复杂度比较高，大概是 O(n^3)，即使使用优化方案，也在 O(n^2.4) 左右。
无论是样本量大，还是特征多，用这个方法都会很慢。所以一般使用其他方法。

算法封装

import numpy as np
from .metrics import r2_scoreclass LinearRegression:def __init__(self):"""初始化Linear Regression模型"""self.coef_ = Noneself.intercept_ = Noneself._theta = Nonedef fit_normal(self, X_train, y_train):"""根据训练数据集X_train, y_train训练Linear Regression模型"""assert X_train.shape[0] == y_train.shape[0], \"the size of X_train must be equal to the size of y_train"X_b = np.hstack([np.ones((len(X_train), 1)), X_train])self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)self.intercept_ = self._theta[0]self.coef_ = self._theta[1:]return selfdef fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):"""根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型"""assert X_train.shape[0] == y_train.shape[0], \"the size of X_train must be equal to the size of y_train"def J(theta, X_b, y):try:return np.sum((y - X_b.dot(theta)) ** 2) / len(y)except:return float('inf')def dJ(theta, X_b, y):# res = np.empty(len(theta))# res[0] = np.sum(X_b.dot(theta) - y)# for i in range(1, len(theta)):#     res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])# return res * 2 / len(X_b)return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):theta = initial_thetacur_iter = 0while cur_iter < n_iters:gradient = dJ(theta, X_b, y)last_theta = thetatheta = theta - eta * gradientif (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):breakcur_iter += 1return thetaX_b = np.hstack([np.ones((len(X_train), 1)), X_train])initial_theta = np.zeros(X_b.shape[1])self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)self.intercept_ = self._theta[0]self.coef_ = self._theta[1:]return selfdef fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):"""根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型"""assert X_train.shape[0] == y_train.shape[0], \"the size of X_train must be equal to the size of y_train"assert n_iters >= 1def dJ_sgd(theta, X_b_i, y_i):return X_b_i * (X_b_i.dot(theta) - y_i) * 2.def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):def learning_rate(t):return t0 / (t + t1)theta = initial_thetam = len(X_b)for cur_iter in range(n_iters):indexes = np.random.permutation(m)X_b_new = X_b[indexes]y_new = y[indexes]for i in range(m):gradient = dJ_sgd(theta, X_b_new[i], y_new[i])theta = theta - learning_rate(cur_iter * m + i) * gradientreturn thetaX_b = np.hstack([np.ones((len(X_train), 1)), X_train])initial_theta = np.random.randn(X_b.shape[1])self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)self.intercept_ = self._theta[0]self.coef_ = self._theta[1:]return selfdef predict(self, X_predict):"""给定待预测数据集X_predict，返回表示X_predict的结果向量"""assert self.intercept_ is not None and self.coef_ is not None, \"must fit before predict!"assert X_predict.shape[1] == len(self.coef_), \"the feature number of X_predict must be equal to X_train"X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])return X_b.dot(self._theta)def score(self, X_test, y_test):"""根据测试数据集 X_test 和 y_test 确定当前模型的准确度"""y_predict = self.predict(X_test)return r2_score(y_test, y_predict)def __repr__(self):return "LinearRegression()"

使用 sklearn 处理 boston 房价回归问题

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasetsboston = datasets.load_boston()boston.keys()
# dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])boston.DESCR
# 有 13 个特征boston.feature_names
# array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')X = boston.data
y = boston.targetX = X[y < 50.0]
y = y[y < 50.0]from sklearn.model_selection import train_test_splitX_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)from sklearn.linear_model import LinearRegressionreg = LinearRegression()reg.fit(X_train, y_train)
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)reg.coef_
'''array([-1.22096140e-01,  3.24934065e-02, -4.51945652e-02,  6.32525034e-02,-1.17444911e+01,  3.61793376e+00, -2.00394486e-02, -1.21059188e+00,2.47235697e-01, -1.31042159e-02, -8.35556922e-01,  8.18076684e-03,-3.81616606e-01])
'''reg.intercept_
# 32.73189970831903reg.score(X_test, y_test)
# 0.7640047258028569

使用 kNN 解决多元线性回归问题

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')knn_reg.score(X_test, y_test)
# 0.5812004118756823# 使用网格搜索寻找超参数
param_grid = [{'weights': ['uniform'],'n_neighbors': [i for i in range(1,11)]}, {'weights': ['distance'],'n_neighbors': [i for i in range(1,11)],'p': [i for i in range(1,6)]}]from sklearn.model_selection import GridSearchCV
# CV 的意思是 Cross Validation，交叉验证。grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=2)%time
grid_search.fit(X_train, y_train)'''CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µsWall time: 4.77 µsFitting 3 folds for each of 60 candidates, totalling 180 fitsGridSearchCV(cv='warn', error_score='raise-deprecating',estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,metric='minkowski',metric_params=None, n_jobs=None,n_neighbors=5, p=2,weights='uniform'),iid='warn', n_jobs=-1,param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],'weights': ['uniform']},{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],pre_dispatch='2*n_jobs', refit=True, return_train_score=False,scoring=None, verbose=2)'''grid_search.best_params_
# {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}grid_search.best_score_
# 0.6343537550346618grid_search.best_estimator_
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=10, p=1, weights='distance')grid_search.best_estimator_.score(X_test, y_test)
# 比 knn 默认的结果，但是不如线性回归的效果。但这里的score 标准，可能和 lr 中的标准不通过。
# 0.6717251762406973