Machine Learning ex1: Linear Regression


1. 简单练习

1.1 输出一个5*5的单位矩阵

# 5x5 integer identity matrix
a = np.identity(5, dtype=int)

2. 梯度下降:训练线性回归的参数θ

2.1 计算代价损失函数Cost

def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) of a linear hypothesis.

    Computes ``J = sum((X @ theta - y) ** 2) / (2 * m)`` where ``m`` is the
    number of rows of ``X``.

    Inputs are converted to plain float arrays first, so ``np.matrix``,
    ``np.ndarray`` and nested lists all work, and ``theta`` / ``y`` may be
    given as a row, a column or a flat vector.

    Args:
        X: Design matrix with one sample per row (m x n).
        y: Targets, m values in row, column or flat layout.
        theta: Parameters, n values in row, column or flat layout.

    Returns:
        The cost as a NumPy float scalar.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(theta, dtype=float).ravel()
    m = len(features)  # dataset size
    # Flattening theta and y keeps the residual one-dimensional; subtracting
    # an (m, 1) column from an (m,) vector would broadcast to (m, m).
    residual = features @ weights - targets
    return np.sum(np.power(residual, 2)) / (2 * m)

2.2 梯度下降函数

def gradientDescent(X, y, theta, alpha, num_iters):
    """Run batch gradient descent for linear regression and record its history.

    Each iteration applies the simultaneous update
    ``theta := theta - (alpha / m) * X.T @ (X @ theta - y)``.
    The recorded cost is ``sum(residual ** 2) / (2 * m)``, the same quantity
    ``computeCost`` returns; it is derived from the residual that the update
    needs anyway, so the hypothesis is evaluated once per iteration.

    Inputs are converted to float arrays, so ``np.matrix``, ``np.ndarray``
    and nested lists are accepted, and ``theta`` / ``y`` may be a row, a
    column or a flat vector. The caller's ``theta`` is never modified.

    Args:
        X: Design matrix with one sample per row (m x n).
        y: Targets, m values.
        theta: Initial parameters, n values.
        alpha: Learning rate.
        num_iters: Number of iterations to run.

    Returns:
        A tuple ``(cost, all_theta)``. ``cost`` has shape ``(num_iters,)``
        and ``all_theta`` has shape ``(num_iters, n)``. Entry ``i`` of both
        describes the parameters *before* the i-th update: row 0 is the
        initial ``theta`` and the last row reflects ``num_iters - 1`` updates.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(theta, dtype=float).ravel()
    m = len(features)

    cost = np.zeros(num_iters)
    all_theta = np.zeros((num_iters, weights.size))
    for i in range(num_iters):
        residual = features @ weights - targets  # hypothesis minus y
        cost[i] = np.sum(np.power(residual, 2)) / (2 * m)
        all_theta[i] = weights
        # Rebind instead of updating in place so neither the caller's theta
        # nor the row just stored in all_theta can be altered.
        weights = weights - (alpha / m) * (features.T @ residual)
    return cost, all_theta

2.3 正规方程 normal equation

def normalequation(X, y):
    """Solve linear regression in closed form (normal equation).

    Finds the least-squares solution of ``X @ theta = y`` with
    ``np.linalg.lstsq`` rather than forming ``(X.T X)^-1`` explicitly. For a
    full-column-rank ``X`` this is the normal-equation solution; when the
    features are linearly dependent it yields the minimum-norm solution
    instead of failing on a singular matrix.

    Args:
        X: Design matrix with one sample per row (m x n); ``np.matrix``,
            ``np.ndarray`` or nested lists.
        y: Targets, either m values or an (m x k) array.

    Returns:
        ``np.matrix`` of shape (k, n), i.e. (1, n) for a single target
        column. This is the row layout used for ``theta`` by ``computeCost``.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)
    solution, *_ = np.linalg.lstsq(features, targets, rcond=None)
    # Transposed so theta comes back as a row; kept as np.matrix because
    # that is the type this function has always returned for matrix input.
    return np.asmatrix(solution).T

3 线性回归

3.1 单变量线性回归


3.1.1 读取数据,然后展示数据

    data_path = 'ex1data1.txt'
    data = pd.read_csv(data_path, names=['Population', 'Profit'], header=None)
    # Add the θ0 column (constant 1) as the first column
    data.insert(loc=0, column='Theta0', value=1)
    # Preview the raw data with pandas' own plotting helper
    data.plot.scatter(x='Population', y='Profit')

3.1.2 线性回归函数

def liner_regression():
    """Outline stub for the single-variable linear regression routine.

    Does nothing and returns ``None``.
    """
    return None

3.2 多变量的线性回归函数


def multi_liner_regression():
    """Outline stub for the multi-variable linear regression routine.

    Does nothing and returns ``None``.
    """
    return None

4 绘图

4.1 绘制线性回归

def plotLinerRegression(data, theta):
    """Plot the fitted line on top of the training data.

    Args:
        data: Object exposing ``Population`` and ``Profit`` columns (the
            callers in this file pass a pandas DataFrame).
        theta: Indexable parameters; ``theta[0]`` is used as the intercept
            and ``theta[1]`` as the slope.
    """
    population = data.Population
    # 100 evenly spaced x values spanning the observed population range
    xs = np.linspace(population.min(), population.max(), 100)
    prediction = theta[0] + theta[1] * xs

    _, axes = plt.subplots()
    axes.plot(xs, prediction, color='blue', alpha=0.4, label='Prediction')
    axes.scatter(population, data.Profit, color='green', alpha=0.4, label='Training Data')
    axes.legend()
    axes.set(xlabel='Population', ylabel='Profit', title='Prediction vs Population size')

4.2 绘制J_theta

def plotJtheta(cost, num_iters, title):
    """Scatter-plot the recorded cost J(θ) against the iteration index.

    Args:
        cost: Sequence of ``num_iters`` cost values.
        num_iters: Number of iterations; x values are ``0 .. num_iters - 1``.
        title: Title placed on the axes.
    """
    iterations = np.arange(num_iters)
    _, axes = plt.subplots()
    axes.scatter(iterations, cost, alpha=0.4)
    axes.set(xlabel='num_iters', ylabel='J(θ)', title=title)

4.3 绘制3D图

def plotThreeD(X, Y, fig_title):
    """Draw the cost surface J(theta0, theta1) as a 3D plot.

    The cost is evaluated with ``computeCost`` on a 100 x 100 grid covering
    theta0 in [-10, 10] and theta1 in [-1, 4].

    Args:
        X: Design matrix passed through to ``computeCost``.
        Y: Targets passed through to ``computeCost``.
        fig_title: Title placed on the axes.
    """
    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib;
    # add_subplot is the supported way to request a 3D axes.
    ax = fig.add_subplot(111, projection='3d')

    theta0 = np.linspace(-10, 10, 100)
    theta1 = np.linspace(-1, 4, 100)
    J_theta = np.zeros((len(theta0), len(theta1)))
    for i, t0 in enumerate(theta0):
        for j, t1 in enumerate(theta1):
            J_theta[i, j] = computeCost(X, Y, np.matrix([t0, t1]))

    # J_theta[i, j] belongs to (theta0[i], theta1[j]). The default 'xy'
    # indexing of meshgrid would pair it with (theta0[j], theta1[i]) and
    # draw the surface transposed, so 'ij' indexing is required here.
    grid0, grid1 = np.meshgrid(theta0, theta1, indexing='ij')
    # Other colormaps that work well here: gist_rainbow, hsv
    surf = ax.plot_surface(grid0, grid1, J_theta, cmap=cm.rainbow, linewidth=0, antialiased=False)
    ax.set_xlabel('theta0')
    ax.set_ylabel('theta1')
    ax.set_zlabel('J_theta')
    ax.set_title(fig_title)
    fig.colorbar(surf, shrink=0.5, aspect=5)



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D# 1.简单练习
# Build a 5x5 integer identity matrix
a = np.identity(5, dtype=int)

# 2. Gradient descent: training the linear-regression parameters θ
# This exercise implements linear regression with a single variable.
# Given historical <city population, store profit> records, predict in which
# city opening a store would be most profitable.
# Data layout: column 1 is the city population in units of 10,000 people;
# column 2 is the profit in units of $10,000.

# 2.1 Cost function used to train θ; features X = [x0, x1, x2, ..., xm],
# parameters theta = [θ0, θ1, θ2, ..., θm]
def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) of a linear hypothesis.

    Computes ``J = sum((X @ theta - y) ** 2) / (2 * m)`` where ``m`` is the
    number of rows of ``X``.

    Inputs are converted to plain float arrays first, so ``np.matrix``,
    ``np.ndarray`` and nested lists all work, and ``theta`` / ``y`` may be
    given as a row, a column or a flat vector.

    Args:
        X: Design matrix with one sample per row (m x n).
        y: Targets, m values in row, column or flat layout.
        theta: Parameters, n values in row, column or flat layout.

    Returns:
        The cost as a NumPy float scalar.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(theta, dtype=float).ravel()
    m = len(features)  # dataset size
    # Flattening theta and y keeps the residual one-dimensional; subtracting
    # an (m, 1) column from an (m,) vector would broadcast to (m, m).
    residual = features @ weights - targets
    return np.sum(np.power(residual, 2)) / (2 * m)


# 2.2 Gradient descent
def gradientDescent(X, y, theta, alpha, num_iters):
    """Run batch gradient descent for linear regression and record its history.

    Each iteration applies the simultaneous update
    ``theta := theta - (alpha / m) * X.T @ (X @ theta - y)``.
    The recorded cost is ``sum(residual ** 2) / (2 * m)``, the same quantity
    ``computeCost`` returns; it is derived from the residual that the update
    needs anyway, so the hypothesis is evaluated once per iteration.

    Inputs are converted to float arrays, so ``np.matrix``, ``np.ndarray``
    and nested lists are accepted, and ``theta`` / ``y`` may be a row, a
    column or a flat vector. The caller's ``theta`` is never modified.

    Args:
        X: Design matrix with one sample per row (m x n).
        y: Targets, m values.
        theta: Initial parameters, n values.
        alpha: Learning rate.
        num_iters: Number of iterations to run.

    Returns:
        A tuple ``(cost, all_theta)``. ``cost`` has shape ``(num_iters,)``
        and ``all_theta`` has shape ``(num_iters, n)``. Entry ``i`` of both
        describes the parameters *before* the i-th update: row 0 is the
        initial ``theta`` and the last row reflects ``num_iters - 1`` updates.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(theta, dtype=float).ravel()
    m = len(features)

    cost = np.zeros(num_iters)
    all_theta = np.zeros((num_iters, weights.size))
    for i in range(num_iters):
        residual = features @ weights - targets  # hypothesis minus y
        cost[i] = np.sum(np.power(residual, 2)) / (2 * m)
        all_theta[i] = weights
        # Rebind instead of updating in place so neither the caller's theta
        # nor the row just stored in all_theta can be altered.
        weights = weights - (alpha / m) * (features.T @ residual)
    return cost, all_theta


# 2.3 Normal equation
def normalequation(X, y):
    """Solve linear regression in closed form (normal equation).

    Finds the least-squares solution of ``X @ theta = y`` with
    ``np.linalg.lstsq`` rather than forming ``(X.T X)^-1`` explicitly. For a
    full-column-rank ``X`` this is the normal-equation solution; when the
    features are linearly dependent it yields the minimum-norm solution
    instead of failing on a singular matrix.

    Args:
        X: Design matrix with one sample per row (m x n); ``np.matrix``,
            ``np.ndarray`` or nested lists.
        y: Targets, either m values or an (m x k) array.

    Returns:
        ``np.matrix`` of shape (k, n), i.e. (1, n) for a single target
        column. This is the row layout used for ``theta`` by ``computeCost``.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)
    solution, *_ = np.linalg.lstsq(features, targets, rcond=None)
    # Transposed so theta comes back as a row; kept as np.matrix because
    # that is the type this function has always returned for matrix input.
    return np.asmatrix(solution).T


# 4 Plotting
# 4.1 Plot the linear regression fit
def plotLinerRegression(data, theta):
    """Plot the fitted line on top of the training data.

    Args:
        data: Object exposing ``Population`` and ``Profit`` columns (the
            callers in this file pass a pandas DataFrame).
        theta: Indexable parameters; ``theta[0]`` is used as the intercept
            and ``theta[1]`` as the slope.
    """
    population = data.Population
    # 100 evenly spaced x values spanning the observed population range
    xs = np.linspace(population.min(), population.max(), 100)
    prediction = theta[0] + theta[1] * xs

    _, axes = plt.subplots()
    axes.plot(xs, prediction, color='blue', alpha=0.4, label='Prediction')
    axes.scatter(population, data.Profit, color='green', alpha=0.4, label='Training Data')
    axes.legend()
    axes.set(xlabel='Population', ylabel='Profit', title='Prediction vs Population size')


# 4.2 Plot J_theta
def plotJtheta(cost, num_iters, title):
    """Scatter-plot the recorded cost J(θ) against the iteration index.

    Args:
        cost: Sequence of ``num_iters`` cost values.
        num_iters: Number of iterations; x values are ``0 .. num_iters - 1``.
        title: Title placed on the axes.
    """
    iterations = np.arange(num_iters)
    _, axes = plt.subplots()
    axes.scatter(iterations, cost, alpha=0.4)
    axes.set(xlabel='num_iters', ylabel='J(θ)', title=title)


# 4.3 Plot J_theta as a 3D surface
def plotThreeD(X, Y, fig_title):
    """Draw the cost surface J(theta0, theta1) as a 3D plot.

    The cost is evaluated with ``computeCost`` on a 100 x 100 grid covering
    theta0 in [-10, 10] and theta1 in [-1, 4].

    Args:
        X: Design matrix passed through to ``computeCost``.
        Y: Targets passed through to ``computeCost``.
        fig_title: Title placed on the axes.
    """
    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib;
    # add_subplot is the supported way to request a 3D axes.
    ax = fig.add_subplot(111, projection='3d')

    theta0 = np.linspace(-10, 10, 100)
    theta1 = np.linspace(-1, 4, 100)
    J_theta = np.zeros((len(theta0), len(theta1)))
    for i, t0 in enumerate(theta0):
        for j, t1 in enumerate(theta1):
            J_theta[i, j] = computeCost(X, Y, np.matrix([t0, t1]))

    # J_theta[i, j] belongs to (theta0[i], theta1[j]). The default 'xy'
    # indexing of meshgrid would pair it with (theta0[j], theta1[i]) and
    # draw the surface transposed, so 'ij' indexing is required here.
    grid0, grid1 = np.meshgrid(theta0, theta1, indexing='ij')
    # Other colormaps that work well here: gist_rainbow, hsv
    surf = ax.plot_surface(grid0, grid1, J_theta, cmap=cm.rainbow, linewidth=0, antialiased=False)
    ax.set_xlabel('theta0')
    ax.set_ylabel('theta1')
    ax.set_zlabel('J_theta')
    ax.set_title(fig_title)
    fig.colorbar(surf, shrink=0.5, aspect=5)


# 3 Linear regression
# 3.1 Single-variable linear regression
def liner_regression():
    """Fit and visualise single-variable linear regression on ex1data1.txt.

    Reads ``ex1data1.txt`` (relative path) as two header-less columns,
    ``Population`` and ``Profit``, prepends a constant column ``X0``, runs
    ``gradientDescent`` for 400 iterations with a learning rate of 0.01, and
    then draws the fitted line, the cost history and the 3D cost surface.
    """
    # Hyper-parameters and plot title
    iterations = 400
    learning_rate = 0.01
    title = 'liner regression'

    # 1. Load the data and preview it
    data = pd.read_csv('ex1data1.txt', names=['Population', 'Profit'], header=None)
    data.insert(0, 'X0', 1)  # constant column that pairs with θ0
    data.plot.scatter(x='Population', y='Profit', alpha=0.4, color='green')

    # 2. Split into features (every column but the last, including the
    #    all-ones column) and target (last column), as matrices
    X = np.matrix(data.iloc[:, :-1].values)
    y = np.matrix(data.iloc[:, -1:].values)
    initial_theta = np.matrix(np.zeros(X.shape[1], dtype=int))

    # 3. Train; the last recorded row of the history is used as the result.
    #    normalequation(X, y) is the closed-form alternative.
    cost, theta_history = gradientDescent(X, y, initial_theta, learning_rate, iterations)
    fitted_theta = theta_history[-1]

    plotLinerRegression(data, fitted_theta)
    plotJtheta(cost, iterations, title)
    plotThreeD(X, y, title)


# 3.2 Multi-variable linear regression
def multi_liner_regression():
    """Fit multi-variable linear regression on ex1data2.txt and plot the cost.

    Reads ``ex1data2.txt`` (relative path), standardises every column with
    ``StandardScaler``, prepends a constant column of ones, runs
    ``gradientDescent`` for 1500 iterations with a learning rate of 0.01 and
    plots the cost history. The last column is used as the target.
    """
    # 1. Load the data
    path = 'ex1data2.txt'
    # header=None matches how liner_regression reads ex1data1.txt; without it
    # the first sample is consumed as column names.
    # NOTE(review): assumes ex1data2.txt has no header row - confirm.
    raw_data = pd.read_csv(path, header=None)

    # 2. Settings
    num_iters = 1500  # number of iterations
    alpha = 0.01  # learning rate
    fig_title = 'multi_liner regression'  # figure title

    # 2.1 Pre-processing: feature scaling; fit_transform returns an ndarray
    scaler = StandardScaler()
    data = scaler.fit_transform(raw_data)
    # Insert the all-ones column at position 0
    data = np.insert(data, obj=0, values=1, axis=1)

    cols = data.shape[1]
    # asmatrix does not copy when the input is already an ndarray
    X = np.asmatrix(data[:, 0:cols - 1])
    y = np.asmatrix(data[:, cols - 1:cols])
    theta = np.matrix(np.zeros(X.shape[1], dtype=int))  # initial theta

    # 3. Train; normalequation(X, y) is the closed-form alternative
    cost, all_theta = gradientDescent(X, y, theta, alpha, num_iters)

    # 4. Plot the cost history
    plotJtheta(cost, num_iters, fig_title)


if __name__ == '__main__':
    liner_regression()
    multi_liner_regression()
    # The plotting helpers only build figures; without this call a plain
    # script run exits before anything is displayed.
    plt.show()

