机器学习作业2编程作业(python):Logistic Regression

英文文档图片均来自原档作业pdf截图网址https://www.coursera.org/learn/machine-learning/programming/8f3qT/linear-regression*

Logistic Regression

修改某行index name：

这个只能对某一行修改行名：

# Maps the label of the first row to 'no'. The returned object is not
# assigned here, so data1 itself is presumably left unchanged
# (rename() is not in-place by default - confirm against the pandas docs).
data1.rename({data1.index[0]:'no'})

试了好多函数，最后还是采用按数量修改。。用loc（）选择还是改不了。。有大神教我一下吗

通过重新排序并根据数量修改行名（强迫症，跟他杠上了，其实不改也行直接iloc+行号）：

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

path = r'D:\Ninachen\wg_machinelearning\machine-learning-ex2\ex2\ex2data1.txt'
data1 = pd.read_csv(path, header=None, names=['exam1 score', 'exam2 score', 'result'])

# Sort ascending by 'result' so the rows of each class are contiguous, then
# replace the shuffled original index with a fresh 0..m-1 one.
data1 = data1.sort_values(by=['result'])
data1 = data1.reset_index(drop=True)

# Add row labels. Each label is derived from the row's own 'result' value
# rather than from hard-coded counts (['unpassed']*40 + ['passed']*60), so the
# code no longer depends on this exact file having 40 zeros and 60 ones.
# .tolist() keeps the resulting index unnamed.
data1.index = data1['result'].map({0: 'unpassed', 1: 'passed'}).tolist()

# Exploration notes kept from the original post:
#   data1.groupby('result').get_group(0).shape was (40, 3), i.e. rows 0-39
#   are the result == 0 students after sorting.
# Attempts the author marked as wrong:
#   data1.rename({data1.index[0:40]: 'unpassed'})
#   data1.iloc[0:40].index.name = 'un'
#   data1.loc[data1['result'] == 1].index = [111] * 60

data1
          exam1 score  exam2 score  result
unpassed    34.623660    78.024693       0
unpassed    38.785804    64.995681       0
unpassed    30.058822    49.592974       0
unpassed    82.226662    42.719879       0
unpassed    40.236894    71.167748       0
...               ...          ...     ...
passed      64.176989    80.908061       1
passed      52.045405    69.432860       1
passed      85.404519    57.051984       1
passed      97.771599    86.727822       1
passed      74.775893    89.529813       1
[100 rows x 3 columns]

画出散点图：

# Scatter plot of the raw data.
#
# Variant that draws two separate figures instead of one:
# data1.loc['unpassed'].plot(kind="scatter", x="exam1 score", y="exam2 score", color='Red')
# data1.loc['passed'].plot(kind="scatter", x="exam1 score", y="exam2 score")
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(data1.loc['unpassed', 'exam1 score'], data1.loc['unpassed', 'exam2 score'], color='Red')
ax.scatter(data1.loc['passed', 'exam1 score'], data1.loc['passed', 'exam2 score'])
ax.legend(['unpassed', 'passed'])  # legend labels must be passed as a list
plt.xlabel('exam1 score')
plt.ylabel('exam2 score')
# Equivalent: ax.set_xlabel(...), ax.set_ylabel(...)
# Wrong (author's note): ax.xlable('...')
plt.title('primary data')
plt.show()

在开始编写程序之前先设置各个参数：


# Build the design matrix, labels and hyper-parameters for gradient descent.
datamat1 = data1.values  # DataFrame -> ndarray
X = np.matrix(datamat1[:, :-1])  # input features: every column except the last
y = np.matrix(datamat1[:, -1]).T  # labels, transposed into a column vector
X = np.insert(X, X.shape[1], values=1, axis=1)  # append a constant column of ones
theta_init = np.matrix(np.zeros((X.shape[1], 1)))  # theta starts at zero
# =========================================
iterations = int(input('please input the iterations:'))  # number of iterations
alpha = float(input('please input the  learning rate alpha:'))  # learning rate
cost_table = np.empty(iterations)  # one cost value per iteration, filled later

y, theta 在这里都是列向量
X是m*n矩阵，m为数据组数，n为特征维数，且我把常数列放在最后一列，即常数对应的是theta[-1,0]

需要使用到sigmoid(),可以直接编写一个或者调用现有的：

import scipy.special


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of x, element-wise.

    Delegates to scipy.special.expit.
    """
    return scipy.special.expit(x)

代价函数：条件概率->对数似然函数->最大似然估计->求J最小时的θ

为什么不采用线性回归时候用的代价函数：因为其在逻辑回归中非凸，有多个极值

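这里，要求的代价函数就是：

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[\,y^{(i)}\log h_\theta(x^{(i)}) + (1-y^{(i)})\log\bigl(1-h_\theta(x^{(i)})\bigr)\right],\qquad h_\theta(x)=\frac{1}{1+e^{-\theta^{T}x}}$$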

继续使用梯度下降法，可以看到几乎与线性回归的代码很相似。首先计算代价函数cost：

def cost(theta, X, y):
    """Return the mean logistic-regression cost J(theta).

    Args:
        theta: parameter column vector, shape (n, 1).
        X: design matrix, shape (m, n).
        y: 0/1 labels as a column vector, shape (m, 1).

    Returns:
        The cross-entropy summed over all m rows and divided by len(X).
    """
    z = X @ theta
    # Numerically stable form of the cross-entropy:
    #   -log(sigmoid(z))     == logaddexp(0, -z)
    #   -log(1 - sigmoid(z)) == logaddexp(0,  z)
    # Taking np.log of a saturated sigmoid (exactly 0 or 1) yields -inf and
    # the "divide by zero encountered in log" warning; this form stays finite.
    J = np.sum(np.multiply(y, np.logaddexp(0, -z))
               + np.multiply(1 - y, np.logaddexp(0, z)))
    return J / len(X)

梯度函数：

def gradient(theta, X, y):
    """Return the gradient of the logistic cost with respect to theta.

    The result is a column vector with one entry per column of X: the
    prediction error sigmoid(X @ theta) - y projected onto each feature and
    averaged over the len(X) rows.
    """
    residual = sigmoid(X @ theta) - y
    return (X.T @ residual) / len(X)

定义一个功能函数：


def ex2_LogisticRegression(X, y, theta, alpha, iterations):
    """Fit theta by batch gradient descent and return the final value.

    Args:
        X: design matrix.
        y: label column vector.
        theta: starting parameters (the caller passes zeros).
        alpha: learning rate.
        iterations: length of the recorded cost history; must be >= 1.

    Returns:
        theta after iterations - 1 gradient steps.

    Side effect:
        Writes the cost before any step into the module-level cost_table[0]
        and the cost after step i into cost_table[i], so cost_table must
        already exist with at least `iterations` entries.
    """
    cost_table[0] = cost(theta, X, y)
    for i in range(1, iterations):
        theta = theta - alpha * gradient(theta, X, y)
        cost_table[i] = cost(theta, X, y)
    return theta

将画图部分整合之后的所有代码：

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy.special


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of x, element-wise."""
    return scipy.special.expit(x)


def cost(theta, X, y):
    """Return the mean logistic-regression cost J(theta).

    theta and y are column vectors, X is the (m, n) design matrix.
    """
    z = X @ theta
    # Stable cross-entropy: -log(sigmoid(z)) == logaddexp(0, -z) and
    # -log(1 - sigmoid(z)) == logaddexp(0, z). This replaces adding 1e-5
    # inside np.log, which silences the log(0) warning but biases the result
    # (a perfectly separated fit then reports a slightly negative cost).
    # Using @ instead of * also keeps this correct for plain ndarrays.
    J = np.sum(np.multiply(y, np.logaddexp(0, -z))
               + np.multiply(1 - y, np.logaddexp(0, z)))
    return J / len(X)


def gradient(theta, X, y):
    """Return the gradient of the cost as a column vector."""
    return (X.T @ (sigmoid(X @ theta) - y)) / len(X)


def ex2_LogisticRegression(X, y, theta, alpha, iterations):
    """Run batch gradient descent from `theta` and return the final theta.

    Performs iterations - 1 updates with learning rate `alpha`. As a side
    effect the module-level cost_table (which must already hold at least
    `iterations` entries) receives the cost before the first step at index 0
    and the cost after step i at index i.
    """
    cost_table[0] = cost(theta, X, y)
    for i in range(1, iterations):
        theta = theta - alpha * gradient(theta, X, y)
        cost_table[i] = cost(theta, X, y)
    return theta


path = r'D:\Ninachen\wg_machinelearning\machine-learning-ex2\ex2\ex2data1.txt'
data1 = pd.read_csv(path, header=None, names=['exam1 score', 'exam2 score', 'result'])
data1 = data1.sort_values(by=['result'])  # ascending by 'result'
data1 = data1.reset_index(drop=True)  # fresh 0..m-1 index
# Label each row from its own 'result' value instead of assuming exactly
# 40 failing and 60 passing rows; .tolist() keeps the index unnamed.
data1.index = data1['result'].map({0: 'unpassed', 1: 'passed'}).tolist()

datamat1 = data1.values  # DataFrame -> ndarray
X = np.matrix(datamat1[:, 0:-1])  # input features
y = np.matrix(datamat1[:, -1])  # labels
y = y.T  # column vector
X = np.insert(X, X.shape[1], values=1, axis=1)  # append a constant column
theta_init = np.matrix(np.zeros((X.shape[1], 1)))  # theta starts at zero
# =========================================
iterations = int(input('please input the iterations:'))  # number of iterations
alpha = float(input('please input the  learning rate alpha:'))  # learning rate
cost_table = np.empty([iterations])

# Final theta
theta = ex2_LogisticRegression(X, y, theta_init, alpha, iterations)
print('final theta =  ', theta)
print('and cost function= ', cost_table[iterations-1])

# Plots
# Left panel: scatter of the data plus the decision boundary.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(121)
ax.scatter(data1.loc['unpassed', 'exam1 score'], data1.loc['unpassed', 'exam2 score'], color='Red')
ax.scatter(data1.loc['passed', 'exam1 score'], data1.loc['passed', 'exam2 score'])
plt.legend(['unpassed', 'passed'])  # legend labels must be passed as a list
plt.xlabel('exam1 score')
plt.ylabel('exam2 score')
xx = np.arange(data1.loc[:, 'exam1 score'].min() - 30, data1.loc[:, 'exam1 score'].max() + 30, 0.1)
yy = xx.copy()
a, b = np.meshgrid(xx, yy)  # build the grid first
# theta[0] has shape (1, 1); theta[0, 0] is the scalar.
# z is the implicit function X @ theta evaluated on the grid (a, b); the
# constant term is the LAST entry of theta because the ones column was
# appended at the end of X.
z = theta[0, 0] * a + theta[1, 0] * b + theta[2, 0]
# Explicit alternative: x2 = -(theta[2, 0] + theta[0, 0] * xx) / theta[1, 0]
# Draw only the z == 0 contour, i.e. the decision boundary. levels=[0] asks
# for that level explicitly; a bare positional 0 is read as a level count.
plt.contour(a, b, z, levels=[0])

# Right panel: cost per iteration.
ax2 = fig.add_subplot(122)
xulie = np.array(range(0, iterations))
ax2.plot(xulie, cost_table)
plt.ylim((0, 4))
plt.xlabel('iterations')
plt.ylabel('cost function')
plt.title('Cost funtion')
plt.show()

学到了用等高线画隐函数~ plt.contour() ；以及对dataframe进行排序分组贴标签~ 结果：

这里会有warning：当sigmoid的输出饱和到恰好0或1时，np.log(0)得到-inf（是对0取对数，并不是溢出），随后0与inf相乘又产生invalid value
D:\PycharmProjects\pythonProject\venv\ex2-2.py:15: RuntimeWarning: divide by zero encountered in log
J = np.sum(- np.multiply(y, np.log(sigmoid(X @ theta))) - np.multiply((1 - y), np.log(1 - sigmoid(X @ theta))))
D:\PycharmProjects\pythonProject\venv\ex2-2.py:15: RuntimeWarning: invalid value encountered in multiply

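https://blog.csdn.net/u012965373/article/details/94392948 在np.log()里加上1e-5解决此warning。注意：这样做会给代价带来一点偏差——当模型几乎完全分对时，log(1 + 1e-5) > 0，算出的cost会略小于0（见文末 -9.99995e-06 的结果）；更稳妥的写法是用 np.logaddexp(0, -z) 代替 -log(sigmoid(z))。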

当迭代200000次时效果比较好：

please input the iterations:200000
please input the  learning rate alpha:0.01
final theta =   [[  0.49798359][  0.49087579][-59.81533926]]
and cost function=  0.3105837855219491

Regularized logistic regression

先画出data2的散点图：

显然是个非线性的！也就是说可以有高次幂项，同样也可以化为线性多项式形式

特征映射函数（借大佬写的一用）来自https://blog.csdn.net/Cowry5/article/details/80247569

format 格式化函数
这里x1,x2是array; 我一开始用matrix就报错了看了好久:(
还有x1,x2必须是一维的

data2.loc[:,['test1']].values.shape #会报错
(118, 1)
data2['test1'].values.shape #不会报错
(118,)

def feature_mapping(x1, x2, power):
    """Map two features onto every polynomial term up to degree `power`.

    Adapted from https://blog.csdn.net/Cowry5/article/details/80247569.

    Args:
        x1, x2: array-likes with the same number of elements. They are
            flattened to 1-D first, so (m,) arrays, (m, 1) columns and
            np.matrix columns are all accepted.
        power: highest total degree to generate.

    Returns:
        A DataFrame with one column "f{a}{b}" holding x1**a * x2**b for every
        pair with a + b <= power, ordered by total degree (f00, f10, f01,
        f20, ...). f00 is the constant column of ones.
    """
    # DataFrame construction needs 1-D columns; 2-D input used to fail here.
    x1 = np.asarray(x1).ravel()
    x2 = np.asarray(x2).ravel()
    data = {}
    for i in range(power + 1):
        for p in range(i + 1):
            data["f{}{}".format(i - p, p)] = np.multiply(np.power(x1, i - p), np.power(x2, p))
    return pd.DataFrame(data)

得到新的数据集data_new：

path = r'D:\Ninachen\wg_machinelearning\machine-learning-ex2\ex2\ex2data2.txt'
data2 = pd.read_csv(path, header=None, names=['test1', 'test2', 'result'])

# Feature mapping up to degree 6.
# Selecting with a list of columns, e.g. data2.loc[:, ['test1']].values, gives
# a DataFrame and therefore a 2-D array; feature_mapping needs the 1-D array
# obtained from a Series. (.values replaces the older .as_matrix.)
x1 = data2['test1'].values
x2 = data2['test2'].values
data_new = feature_mapping(x1, x2, power=6)

data_new.head()
   f00       f10      f01       f20  ...       f33       f24       f15       f06
0  1.0  0.051267  0.69956  0.002628  ...  0.000046  0.000629  0.008589  0.117206
1  1.0 -0.092742  0.68494  0.008601  ... -0.000256  0.001893 -0.013981  0.103256
2  1.0 -0.213710  0.69225  0.045672  ... -0.003238  0.010488 -0.033973  0.110047
3  1.0 -0.375000  0.50219  0.140625  ... -0.006679  0.008944 -0.011978  0.016040
4  1.0 -0.513250  0.46564  0.263426  ... -0.013650  0.012384 -0.011235  0.010193
[5 rows x 28 columns]

接下来是代价函数，这里要正则化。

正则项（由惩罚系数λ控制）
防止过拟合措施：1.减少特征 2.正则化

这里写到对“θ0”（即常数项系数）不进行正则化。那么程序中对应的应该是theta[0,0]（即data_new中的{f00}）

代价函数：

def cost(theta, X, y, lam):
    """Return the regularized logistic-regression cost J(theta).

    Args:
        theta: parameter column vector, shape (n, 1); theta[0, 0] is the
            constant-term coefficient and is not penalized.
        X: design matrix, shape (m, n).
        y: 0/1 labels as a column vector, shape (m, 1).
        lam: regularization strength lambda.

    Returns:
        (cross-entropy sum + lam / 2 * sum of squared non-constant thetas)
        divided by len(X).
    """
    z = X @ theta
    # Numerically stable cross-entropy:
    #   -log(sigmoid(z))     == logaddexp(0, -z)
    #   -log(1 - sigmoid(z)) == logaddexp(0,  z)
    # so a saturated sigmoid no longer produces log(0) = -inf.
    J = np.sum(np.multiply(y, np.logaddexp(0, -z))
               + np.multiply(1 - y, np.logaddexp(0, z)))
    # Penalty over every theta except the constant term theta[0, 0].
    second = lam / 2 * (np.sum(np.power(theta, 2)) - np.power(theta[0, 0], 2))
    return (J + second) / len(X)

梯度：

def gradient(theta, X, y, lam):
    """Return the gradient of the regularized cost as a column vector.

    Args:
        theta: parameter column vector, shape (n, 1).
        X: design matrix, shape (m, n).
        y: label column vector, shape (m, 1).
        lam: regularization strength lambda.
    """
    gra_ori = (X.T @ (sigmoid(X @ theta) - y)) / len(X)  # unregularized part
    regu = lam / len(X) * theta  # new array, so theta itself is not modified
    regu[0, 0] = 0  # the constant term is not regularized
    return gra_ori + regu

画决策边界依旧借鉴了别的大佬写的程序 .ravel()：ndarray降维

# Evaluate X @ theta on a grid and draw its zero level set as the boundary.
xx = np.arange(data2.loc[:, 'test1'].min()-0.1, data2.loc[:, 'test1'].max()+0.1, 0.05)
yy = xx.copy()
a, b = np.meshgrid(xx, yy)
# .ravel() flattens the grid so feature_mapping receives 1-D arrays.
z = feature_mapping(a.ravel(), b.ravel(), 6).values
# Explicit matrix product: `*` is only a matrix product when theta is an
# np.matrix. np.asarray drops the matrix type before reshaping to the grid.
z = np.asarray(z @ theta)
z = z.reshape(a.shape)
# levels=[0] requests exactly the z == 0 contour; a bare positional 0 is
# interpreted as a number of levels.
plt.contour(a, b, z, levels=[0])

加上画图整合一下代码：

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy.special


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of x, element-wise."""
    return scipy.special.expit(x)


def feature_mapping(x1, x2, power):
    """Map two features onto every polynomial term up to degree `power`.

    Adapted from https://blog.csdn.net/Cowry5/article/details/80247569.
    x1 and x2 are flattened to 1-D, so (m,) arrays and (m, 1) columns both
    work. Returns a DataFrame whose column "f{a}{b}" holds x1**a * x2**b for
    every a + b <= power; f00 is the constant column.
    """
    x1 = np.asarray(x1).ravel()
    x2 = np.asarray(x2).ravel()
    data = {}
    for i in range(power + 1):
        for p in range(i + 1):
            data["f{}{}".format(i - p, p)] = np.multiply(np.power(x1, i - p), np.power(x2, p))
    return pd.DataFrame(data)


def cost(theta, X, y, lam):
    """Return the regularized logistic-regression cost J(theta).

    theta and y are column vectors, X is the (m, n) design matrix and lam is
    the regularization strength. theta[0, 0] (the constant term) is not
    penalized.
    """
    z = X @ theta
    # Stable cross-entropy: -log(sigmoid(z)) == logaddexp(0, -z) and
    # -log(1 - sigmoid(z)) == logaddexp(0, z). This replaces adding 1e-5
    # inside np.log, which hides the log(0) warning but biases the result
    # (a perfectly separated fit then reports a slightly negative cost).
    J = np.sum(np.multiply(y, np.logaddexp(0, -z))
               + np.multiply(1 - y, np.logaddexp(0, z)))
    second = lam / 2 * (np.sum(np.power(theta, 2)) - np.power(theta[0, 0], 2))
    return (J + second) / len(X)


def gradient(theta, X, y, lam):
    """Return the gradient of the regularized cost as a column vector."""
    gra_ori = (X.T @ (sigmoid(X @ theta) - y)) / len(X)  # unregularized part
    regu = lam / len(X) * theta  # new array, so theta itself is not modified
    regu[0, 0] = 0  # the constant term is not regularized
    return gra_ori + regu


def ex2_RegularizedLogisticRegression(X, y, theta, alpha, iterations, lam):
    """Run regularized batch gradient descent and return the final theta.

    Performs iterations - 1 updates with learning rate `alpha` and penalty
    `lam`. As a side effect the module-level cost_table (which must already
    hold at least `iterations` entries) receives the cost before the first
    step at index 0 and the cost after step i at index i.
    """
    cost_table[0] = cost(theta, X, y, lam)
    for i in range(1, iterations):
        theta = theta - alpha * gradient(theta, X, y, lam)
        cost_table[i] = cost(theta, X, y, lam)
    return theta


path = r'D:\Ninachen\wg_machinelearning\machine-learning-ex2\ex2\ex2data2.txt'
data2 = pd.read_csv(path, header=None, names=['test1', 'test2', 'result'])

# Feature mapping.
# Selecting with a list of columns, e.g. data2.loc[:, ['test1']].values, gives
# a DataFrame and therefore a 2-D array; feature_mapping needs the 1-D array
# obtained from a Series. (.values replaces the older .as_matrix.)
x1 = data2['test1'].values
x2 = data2['test2'].values
# Named `power` rather than `pow` so the builtin pow() is not shadowed.
power = int(input('please input the power:'))  # highest polynomial degree
data_new = feature_mapping(x1, x2, power=power)
data_new = data_new.values
X = np.matrix(data_new[:, :])  # input features
y = np.matrix(data2.values[:, -1])  # labels
y = y.T  # column vector
# No constant column is appended here: feature_mapping already yields f00.
theta_init = np.matrix(np.zeros((X.shape[1], 1)))  # theta starts at zero
# =========================================
iterations = int(input('please input the iterations:'))  # number of iterations
alpha = float(input('please input the  learning rate alpha:'))  # learning rate
lam = float(input('please input lambda:'))  # regularization strength
cost_table = np.empty([iterations])

# Final theta
theta = ex2_RegularizedLogisticRegression(X, y, theta_init, alpha, iterations, lam)
print('final theta =  ', theta)
print('( with lambda =', lam ,')')
print('and cost function= ', cost_table[iterations-1])

# Plots =========================
passed = data2.loc[data2['result']==1]
unpassed = data2.loc[data2['result']==0]
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(121)
ax.scatter(unpassed.test1, unpassed.test2, color='Red')
ax.scatter(passed.test1, passed.test2)
plt.legend(['unpassed', 'passed'])  # legend labels must be passed as a list
plt.xlabel('test1')
plt.ylabel('test2')

# Decision boundary: evaluate X @ theta on a grid and draw its zero level.
xx = np.arange(data2.loc[:, 'test1'].min()-0.1, data2.loc[:, 'test1'].max()+0.1, 0.05)
yy = xx.copy()
a, b = np.meshgrid(xx, yy)
z = feature_mapping(a.ravel(), b.ravel(), power).values
# Explicit matrix product; np.asarray drops the np.matrix type before the
# result is reshaped back onto the grid.
z = np.asarray(z @ theta)
z = z.reshape(a.shape)
# levels=[0] requests exactly the z == 0 contour; a bare positional 0 is
# interpreted as a number of levels.
plt.contour(a, b, z, levels=[0])

# Cost per iteration.
ax2 = fig.add_subplot(122)
xulie = np.array(range(0, iterations))
ax2.plot(xulie, cost_table)
plt.ylim((0, 4))
plt.xlabel('iterations')
plt.ylabel('cost function')
plt.title('Cost funtion')
plt.show()

结果：

迭代500000次，α=0.1，λ=0, power=6：

迭代500000次，α=0.1，λ=1, power=6：

迭代500000次，α=0.1，λ=3, power=6：

迭代500000次，α=0.1，λ=100, power=6：

将代码重新应用到ex2_data1上，选择power=3,迭代400000次，α=0.1，λ=0时：

please input the power:3
please input the iterations:400000
please input the  learning rate alpha:0.1
please input lambda:0
final theta =   [[-1.64468600e+03][-6.45254973e+04][-6.48535093e+04][-2.00511763e+06][-1.84890789e+06][-2.02252292e+06][-6.97358719e+03][ 8.51691588e+04][ 9.51192957e+03][ 1.51535958e+04]]
( with lambda = 0.0 )
and cost function=  -9.999950000398841e-06