吴恩达机器学习python实现8 异常检测及推荐系统

异常检测

1、可视化数据

def visualize_dataset(X):
    """Scatter-plot the first two columns of ``X`` on the current pyplot axes.

    Args:
        X: array whose last axis holds the features; only features 0 and 1
            are drawn.

    The points are drawn with an "x" marker and labelled "point" so that a
    later ``plt.legend()`` call can pick them up.
    """
    first_feature = X[..., 0]
    second_feature = X[..., 1]
    plt.scatter(first_feature, second_feature, marker="x", label="point")

2、估计参数

def estimate_parameters_for_gaussian_distribution(X):
    """Estimate the per-feature mean and variance of ``X``.

    Args:
        X: 2-D array with one sample per row and one feature per column.

    Returns:
        A ``(mu, sigma2)`` tuple of 1-D arrays, each holding one value per
        column of ``X``. The variance uses NumPy's default normalisation
        (division by the number of samples).
    """
    mu = np.mean(X, axis=0)
    sigma2 = np.var(X, axis=0)
    return mu, sigma2

3、根据高斯模型，计算概率

def gaussian_distribution(X, mu, sigma2):
    """Evaluate an independent-feature Gaussian density for every row of ``X``.

    Each feature is scored with a univariate normal density parameterised by
    the matching entries of ``mu`` and ``sigma2``; the per-feature densities
    are then multiplied together along each row.

    Args:
        X: 2-D array, one sample per row.
        mu: per-feature means (broadcast against the columns of ``X``).
        sigma2: per-feature variances (broadcast against the columns of ``X``).

    Returns:
        1-D array with one density value per row of ``X``.
    """
    exponent = -((X - mu) ** 2) / (2 * sigma2)
    per_feature_density = np.exp(exponent) / np.sqrt(2 * np.pi * sigma2)
    return np.prod(per_feature_density, axis=1)

4、画出高斯分布的等高线

def visualize_contours(mu, sigma2):
    """Draw contour lines of the fitted Gaussian density on the current axes.

    The density is evaluated on a 100 x 100 grid spanning [5, 25] on both
    axes, using ``gaussian_distribution`` with the given parameters.

    Args:
        mu: per-feature means of the 2-feature model.
        sigma2: per-feature variances of the 2-feature model.
    """
    x, y = np.linspace(5, 25, 100), np.linspace(5, 25, 100)
    xx, yy = np.meshgrid(x, y)
    # One (x, y) grid point per row, so the density helper can score them all.
    grid_points = np.concatenate((xx.reshape(-1, 1), yy.reshape(-1, 1)), axis=1)
    z = gaussian_distribution(grid_points, mu, sigma2).reshape(xx.shape)
    # A contour line is drawn only where z equals one of these levels.
    cont_levels = [10**h for h in range(-20, 0, 3)]
    plt.contour(xx, yy, z, cont_levels)

5、计算precision、recall

def error_analysis(yp, yt):
    """Return the F1 score of binary predictions against the true labels.

    Args:
        yp: predicted labels, where 1 marks the positive class.
        yt: true labels, aligned element-by-element with ``yp``.

    Returns:
        The F1 score. Precision, recall and F1 each fall back to 0 when
        their denominator would be zero.
    """
    tp = fp = fn = 0
    for predicted, actual in zip(yp, yt):
        if predicted == actual:
            # Matching negatives are not needed for precision or recall.
            if predicted == 1:
                tp += 1
        elif predicted == 1:
            fp += 1
        else:
            fn += 1
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    return f1

6、选择最好的阈值

def select_threshold(yval, pval):
    """Pick the density threshold that maximises F1 on a validation set.

    1000 evenly spaced candidate thresholds between the smallest and largest
    value of ``pval`` are tried; a sample is predicted anomalous (label 1)
    when its density is strictly below the candidate.

    Args:
        yval: true validation labels (1 = anomaly), one per sample.
        pval: density assigned to each validation sample.

    Returns:
        ``(epsilon, f1)`` for the first candidate reaching the highest F1.
        If no candidate scores above zero, ``(0.0, 0.0)`` is returned.
    """
    pval = np.asarray(pval)
    best_epsilon, best_f1 = 0.0, 0.0
    for epsilon in np.linspace(np.min(pval), np.max(pval), 1000):
        predictions = (pval < epsilon).astype(float)
        f1 = error_analysis(predictions, yval)
        # Strict comparison keeps the earliest candidate on ties.
        if f1 > best_f1:
            best_epsilon, best_f1 = epsilon, f1
    return best_epsilon, best_f1

7、根据高斯模型检测出异常数据

def detection(X, e, mu, sigma2):
    """Return the rows of ``X`` whose Gaussian density is below ``e``.

    Args:
        X: 2-D array, one sample per row.
        e: density threshold; rows scoring strictly below it are anomalies.
        mu: per-feature means of the fitted model.
        sigma2: per-feature variances of the fitted model.

    Returns:
        2-D array holding the anomalous rows of ``X``. When nothing is
        flagged the result has zero rows but still keeps the column count.
    """
    p = gaussian_distribution(X, mu, sigma2)
    return np.asarray(X)[p < e]

8、可视化异常数据

def circle_anomaly_points(X):
    """Circle the given points in red on the current pyplot axes.

    Args:
        X: array whose last axis holds the features; only features 0 and 1
            are used as coordinates.

    The markers are hollow (no face colour) with a red edge and are labelled
    "anomaly point" for a later ``plt.legend()`` call.
    """
    horizontal = X[..., 0]
    vertical = X[..., 1]
    plt.scatter(
        horizontal,
        vertical,
        s=80,
        facecolor="none",
        edgecolors="r",
        label="anomaly point",
    )

9、低维数据测试

# Low-dimensional demo: fit the Gaussian model, choose a threshold on the
# validation split, then plot the data, the density contours and the anomalies.
# NOTE(review): `文件路径` ("file path") is a placeholder left by the author;
# bind it to the path of the .mat data file before running.
data = sio.loadmat(文件路径)
X = data["X"]  # (307,2)
visualize_dataset(X)
mu, sigma2 = estimate_parameters_for_gaussian_distribution(X)
# Recorded values: mu = [14.11222578 14.99771051], sigma2 = [1.83263141 1.70974533]
p = gaussian_distribution(X, mu, sigma2)  # (307,)
visualize_contours(mu, sigma2)
Xval = data["Xval"]  # (307,2)
yval = data["yval"]  # (307,1)
print(yval[:3])
e, f1 = select_threshold(yval.ravel(), gaussian_distribution(Xval, mu, sigma2))
print('best choice of epsilon is ', e, ',the F1 score is ', f1)
# Recorded output:
# best choice of epsilon is  8.999852631901394e-05 ,the F1 score is  0.8750000000000001
anomaly_points = detection(X, e, mu, sigma2)
circle_anomaly_points(anomaly_points)
plt.title('anomaly detection')
plt.legend()
plt.show()

10、高维数据测试

# High-dimensional demo: same pipeline as the 2-D case, without plotting.
# NOTE(review): `文件路径` ("file path") is a placeholder left by the author;
# bind it to the path of the .mat data file before running.
data2 = sio.loadmat(文件路径)
X = data2["X"]  # (1000,11)
Xval = data2["Xval"]  # (100,11)
yval = data2["yval"]  # (100, 1)
mu, sigma2 = estimate_parameters_for_gaussian_distribution(X)
e, f1 = select_threshold(yval.ravel(), gaussian_distribution(Xval, mu, sigma2))
anomaly_points = detection(X, e, mu, sigma2)
print('\n\nfor this high dimensional dataset \nbest choice of epsilon is ', e, ',the F1 score is ', f1)
print('the number of anomaly points is', anomaly_points.shape[0])
# Recorded output:
# for this high dimensional dataset
# best choice of epsilon is  1.3786074982000235e-18 ,the F1 score is  0.6153846153846154
# the number of anomaly points is 117

推荐系统

1、导入所需的库

import scipy.io as sio
import numpy as np
import scipy.optimize as opt
from sklearn.metrics import mean_squared_error

2、参数维数变换

# Flatten the parameters into a single 1-D vector.
def serialize(X, theta):
    """Concatenate the flattened ``X`` and ``theta`` into one 1-D array.

    Args:
        X: array of movie features.
        theta: array of user parameters.

    Returns:
        1-D array holding every entry of ``X`` (row-major) followed by every
        entry of ``theta`` (row-major).
    """
    flat_features = X.flatten()
    flat_theta = theta.flatten()
    return np.concatenate((flat_features, flat_theta), axis=0)


# Restore the parameter matrices from the 1-D vector.
def deserializer(params, nm, nu, nf):
    """Split a flat parameter vector back into ``X`` and ``theta``.

    Args:
        params: 1-D array laid out as all entries of ``X`` followed by all
            entries of ``theta``.
        nm: number of rows of ``X`` (movies).
        nu: number of rows of ``theta`` (users).
        nf: number of columns shared by ``X`` and ``theta`` (features).

    Returns:
        ``(X, theta)`` with shapes ``(nm, nf)`` and ``(nu, nf)``.
    """
    split_at = nm * nf
    X = params[:split_at].reshape(nm, nf)
    theta = params[split_at:].reshape(nu, nf)
    return X, theta

3、协同过滤算法目标函数

def collaborative_filtering_cost(params, Y, R, nm, nu, nf, lamda=0.0):
    """Regularised squared-error cost for collaborative filtering.

    Args:
        params: flat vector holding ``X`` then ``theta`` (see ``deserializer``).
        Y: (nm, nu) rating matrix.
        R: (nm, nu) indicator matrix; entries multiply the squared error, so
            a zero removes that rating from the cost.
        nm: number of movies.
        nu: number of users.
        nf: number of features.
        lamda: regularisation strength applied to both ``X`` and ``theta``.

    Returns:
        Scalar cost: half the masked squared error plus ``lamda / 2`` times
        the squared entries of ``theta`` and of ``X``.
    """
    X, theta = deserializer(params, nm, nu, nf)
    squared_error = np.sum(((X.dot(theta.T) - Y) ** 2) * R) / 2
    theta_penalty = (lamda / 2) * np.sum(theta**2)
    feature_penalty = (lamda / 2) * np.sum(X**2)
    return squared_error + theta_penalty + feature_penalty

4、协同过滤梯度计算

def collaborative_filtering_gradient(params, Y, R, nm, nu, nf, lamda=0.0):
    """Gradient of the collaborative-filtering cost w.r.t. ``params``.

    Args:
        params: flat vector holding ``X`` then ``theta`` (see ``deserializer``).
        Y: (nm, nu) rating matrix.
        R: (nm, nu) indicator matrix masking the prediction error.
        nm: number of movies.
        nu: number of users.
        nf: number of features.
        lamda: regularisation strength.

    Returns:
        Flat vector laid out like ``params``: the gradient for ``X`` followed
        by the gradient for ``theta``.
    """
    X, theta = deserializer(params, nm, nu, nf)
    # The masked prediction error is shared by both partial derivatives.
    masked_error = (X.dot(theta.T) - Y) * R
    g_X = masked_error.dot(theta) + lamda * X
    g_theta = masked_error.T.dot(X) + lamda * theta
    return serialize(g_X, g_theta)

5、梯度检验

def check_gradient(params, Y, R, nm, nu, nf, lamda=0.0):
    """Numerically approximate the gradient of the collaborative-filtering cost.

    Each component is estimated with a central difference of
    ``collaborative_filtering_cost``; the result can be compared against
    ``collaborative_filtering_gradient`` to validate the analytic gradient.
    One pair of cost evaluations is made per parameter, so this is only
    practical for small problems.

    Args:
        params: flat parameter vector at which to evaluate the gradient.
        Y: rating matrix.
        R: indicator matrix.
        nm: number of movies.
        nu: number of users.
        nf: number of features.
        lamda: regularisation strength forwarded to the cost function.

    Returns:
        1-D array with the same length as ``params`` holding the numerical
        gradient.
    """
    e = 0.0001
    m = len(params)
    g_params = np.zeros((m,))
    for i in range(m):
        # Perturb only the i-th parameter.
        temp = np.zeros((m,))
        temp[i] = e
        cost_plus = collaborative_filtering_cost(params + temp, Y, R, nm, nu, nf, lamda)
        cost_minus = collaborative_filtering_cost(params - temp, Y, R, nm, nu, nf, lamda)
        g_params[i] = (cost_plus - cost_minus) / (2 * e)
    return g_params

6、代入数据

# NOTE(review): `文件路径` ("file path") is a placeholder left by the author;
# bind it to the path of the ratings .mat file before running.
data1 = sio.loadmat(文件路径)
# Y is a (number of movies x number of users) array of ratings from 1 to 5.
# R is an "indicator" array of binary values marking whether a user rated a
# movie. Both should have the same dimensions.
Y = data1["Y"]  # (1682,943)
R = data1["R"]  # (1682,943)
data2 = sio.loadmat(r"E:\zl\机器学习\1\data\ex8\ex8_anomaly_detection_and_recommender_system_data_ex8_movieParams")
X = data2["X"]  # (1682,10)
theta = data2["Theta"]  # (943,10)
nu = data2["num_users"][0][0]  # 943
nm = data2["num_movies"][0][0]  # 1682
nf = data2["num_features"][0][0]  # 10
print(collaborative_filtering_cost(serialize(X, theta), Y, R, nm, nu, nf))  # 27918.64012454421
print(collaborative_filtering_cost(serialize(X, theta), Y, R, nm, nu, nf, 1.5))  # 34821.703613072226

# Read the movie titles: each line is split once on the first space and the
# remainder is kept as the title.
# NOTE(review): `文件路径` ("file path") is a placeholder left by the author;
# bind it to the relevant file path before each use below.
with open(文件路径) as f:
    movies = []
    for line in f:
        movies.append(line.split(' ', 1)[-1])

# Train the model.
# Add one custom user's ratings as an extra column.
my_ratings = np.zeros((1682, 1))
my_ratings[0] = 4
my_ratings[97] = 2
my_ratings[6] = 3
my_ratings[11] = 5
my_ratings[53] = 4
my_ratings[63] = 5
my_ratings[65] = 3
my_ratings[68] = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354] = 5
Y = np.concatenate((Y, my_ratings), axis=1)
R = np.concatenate((R, my_ratings > 0), axis=1)
nu += 1

# The training call is left commented out; parameters saved from an earlier
# run are loaded instead.
# params = serialize(np.random.random((nm, nf)), np.random.random((nu, nf)))
# res = opt.minimize(fun=collaborative_filtering_cost, x0=params, args=(Y, R, nm, nu, nf,10), method="TNC",
#                    jac=collaborative_filtering_gradient)
# print(res.shape)
trained_X, trained_theta = deserializer(sio.loadmat(文件路径)["params"].ravel(), nm, nu, nf)
predict = trained_X.dot(trained_theta.T)
my_predict = predict[..., -1]  # predictions for the user appended above

# Pick the 10 best recommendations. `my_predict` is a view into `predict`, so
# the ranking must not overwrite entries: `predict` is reused for the error
# computation below. The stable sort keeps the lowest index first on ties.
top_indices = np.argsort(-my_predict, kind="stable")[:10]
for index in top_indices:
    print("Predicting rating ", my_predict[index], " for movie ", movies[index])
# Output recorded in the original post:
# Predicting rating  4.291401160077979  for movie  Titanic(1997)
# Predicting rating  4.119953862808096  for movie  Star Wars (1977)
# Predicting rating  3.9792200003762264  for movie  Raiders of the Lost Ark (1981)
# Predicting rating  3.9099976364851314  for movie  Good Will Hunting (1997)
# Predicting rating  3.885805506896392  for movie  Shawshank Redemption, The (1994)
# Predicting rating  3.8729551652292584  for movie  Return of the Jedi (1983)
# Predicting rating  3.8712945387591366  for movie  Braveheart(1995)
# Predicting rating  3.863004536777663  for movie  Empire Strikes Back, The (1980)
# Predicting rating  3.757933676945382  for movie  Terminator 2: Judgment Day(1991)
# Predicting rating  3.7576861972110667  for movie  As Good As It Gets (1997)

# Evaluate with mean squared error over the entries that were actually rated.
Y = Y.flatten()
R = R.flatten()
predict = predict.flatten()
rated = R == 1
true_y = Y[rated]
pre_y = predict[rated]
print("当前训练对岳原始数据集的均方误差", mean_squared_error(true_y, pre_y))
# Output recorded in the original post (produced by the original script):
# 当前训练对岳原始数据集的均方误差 0.6400023155268085

吴恩达机器学习python实现8 异常检测及推荐系统相关推荐

8. 吴恩达机器学习课程-作业8-异常检测和推荐系统
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 8. ...
吴恩达机器学习（十三）异常检测（高斯分布）
目录 0. 前言 1. 高斯分布(Gaussian distribution) 2. 参数估计 3. 异常检测算法(原始模型) 4. 高斯分布异常阈值的选择 5. 多变量高斯分布(多元模型) 6. 原 ...
python 异常检测算法_吴恩达机器学习中文版笔记：异常检测（Anomaly Detection）
大数据文摘经授权转载作者:黄海广在接下来的一系列视频中,我将向大家介绍异常检测(Anomaly detection)问题.这是机器学习算法的一个常见应用.这种算法的一个有趣之处在于:它虽然主要用于 ...
吴恩达|机器学习作业8.0.异常检测
8.0.异常检测 1)题目: 在本练习中,您将实现异常检测算法,并将其应用于检测网络中的故障服务器.在第二部分中,您将使用协同过滤来构建电影推荐系统. 在本练习中,您将实现一个异常检测算法来检测服务器 ...
吴恩达机器学习python作业之多变量线性回归
建议先看单变量线性回归再看多变量线性回归哦. 参考链接: (7条消息) 吴恩达|机器学习作业1.1多变量线性回归_学吧学吧终成学霸的博客-CSDN博客数据集:一共三列,左边两列是自变量x,最右边一列 ...
吴恩达机器学习python代码练习一（线性回归）
吴恩达机器学习练习文件下载地址: 链接:https://pan.baidu.com/s/1RvUeG10FBpV9RyFtOX1Zdw 提取码:5b4x 单变量线性回归 import numpy as ...
吴恩达机器学习python代码练习三（多类别分类）
import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.io as sio from s ...
吴恩达机器学习python实现（6）：SVM支持向量机（文末附完整代码）
所有的数据来源:链接:https://pan.baidu.com/s/1vTaw1n77xPPfKk23KEKARA 提取码:5gl2 1 Support Vector Machines 1.1 Pr ...
吴恩达机器学习 EX7 第二部分主成分分析(PCA)
2 主成分分析主成分分析通过协方差矩阵提取数据的主要成分,如90%的成分,通常用户数据压缩和数据可视化(维度降低方便可视化) 2.1 导入模块和数据该部分通过将二维数据压缩成一维数据演示主成分分析 ...
吴恩达机器学习ex1-matlab版学习总结笔记-(1)单变量线性回归
作业任务项一:5*5矩阵A 代码如下: A=eye(5); eye()是单位矩阵,除了对角线为1,其余项都为0.5为矩阵维度,即生成5*5矩阵. 作业任务项二:单变量线性回归预测代码如下: data ...

吴恩达机器学习python实现8 异常检测及推荐系统

吴恩达机器学习python实现8 异常检测及推荐系统相关推荐

最新文章

热门文章