设备异常状态检测相关内容（一）

数据集简单介绍

数据集来源：https://www.kaggle.com/arashnic/sensor-fault-detection-data

数据为时间序列，由以传感器 ID 唯一标识的传感器测量得到。在这一系列测量过程中，传感器曾出现断开连接或故障的情况。

该数据集旨在为故障检测分析组件提供服务。

SensorID = 1 对应 PT100 温度传感器，工作在存在灰尘和振动的工业环境中。

传感器数据处理

（1）读入数据

import pandas as pd
# Load the raw sensor log. The code below reads its 'Timestamp' and 'Value'
# columns.
filename = 'C:/Users/zzzz/Desktop/0306/sensor-fault-detection.csv'
dataframe = pd.read_csv(filename)

# Quick exploratory checks. These are bare expressions, so they only display
# something when run in an interactive session / notebook cell.
dataframe.describe()  # descriptive statistics
dataframe['Value'].value_counts()  # every unique value and how often it occurs
dataframe['Value'].nunique()  # number of unique values
dataframe['Timestamp']  # inspect the 'Timestamp' column

（2）进行调整和筛选

def my_trim(x):
    """Normalize a timestamp string to the 'YYYY-MM-DD HH:MM:SS' layout.

    Keeps only the first 19 characters of *x* (which drops any trailing
    time-zone information) and replaces the 'T' separator with a space.

    Args:
        x: Timestamp string such as '2017-03-01T23:20:00+03:00'.

    Returns:
        The trimmed string, e.g. '2017-03-01 23:20:00'.
    """
    return x[0:19].replace("T", " ")
# Strip the 'T' separator and the time-zone suffix from every timestamp.
date_array = dataframe['Timestamp'].apply(my_trim)

# Parsed timestamps, converted in one vectorised call instead of one
# pd.to_datetime call per row.
# NOTE(review): this list is not used below. The column keeps the cleaned
# *strings*; their fixed 'YYYY-MM-DD HH:MM:SS' layout makes the string
# comparisons in the range filter order chronologically.
datetime_array = list(pd.to_datetime(date_array, format='%Y-%m-%d %H:%M:%S'))
dataframe['Timestamp'] = date_array

# Keep only the rows inside the date range of interest.
dataframe_new = dataframe[
    (dataframe['Timestamp'] >= '2016-12-16 00:00:00')
    & (dataframe['Timestamp'] < '2017-09-01 00:00:00')
]
# Renumber the rows from 0 and discard the old index. This is equivalent to
# reset_index() followed by dropping the generated 'index' column; dropping
# 'index' without the reset raises KeyError because no such column exists.
dataframe_new = dataframe_new.reset_index(drop=True)

（3）绘示意图

import matplotlib.pyplot as plt
def _plot_value_window(frame, start, stop):
    """Plot 'Value' against 'Timestamp' for rows ``start:stop`` of *frame*."""
    frame.iloc[start:stop].plot(x="Timestamp", y="Value", figsize=(12, 6))
    plt.xlabel("Timestamp")  # x-axis label
    plt.ylabel("Value")  # y-axis label
    plt.title("sensor temperature")  # figure title
    plt.show()


# First 260 rows of the selected range.
_plot_value_window(dataframe_new, 0, 260)

# Conditional filter.
# NOTE(review): despite the name, this window covers one hour
# (00:00:00 up to, but excluding, 01:00:00 on 2016-12-16), not one day.
oneday_count = dataframe_new[
    (dataframe_new['Timestamp'] >= '2016-12-16 00:00:00')
    & (dataframe_new['Timestamp'] < '2016-12-16 01:00:00')
]

# Rows 520-779 of the selected range.
_plot_value_window(dataframe_new, 520, 780)

（4）进行序列的分段

# The 'Value' column of the filtered frame; the segmentation input below is
# sliced from this Series.
dataframe_new_values=dataframe_new['Value']

#序列分段，降噪
import numpy as np#计算误差，采用回归方法的误差平方和，也可采用其他方法
def calculate_error(seq_time, seq_range):
    """Return the residual sum of squares of a straight-line fit to a segment.

    Args:
        seq_time: Sequence of numeric samples.
        seq_range: ``(start, stop)`` half-open index range of the segment.

    Returns:
        The sum of squared residuals of the least-squares line through
        ``seq_time[start:stop]`` (x is the sample index), or ``0.0`` when
        ``np.linalg.lstsq`` reports no residual, e.g. for segments of two
        samples or fewer.
    """
    start, stop = seq_range
    x = np.arange(start, stop)
    y = np.array(seq_time[start:stop])
    # Design matrix [x, 1] for the model y = slope * x + intercept.
    A = np.ones((len(x), 2), float)
    A[:, 0] = x
    # lstsq returns (coefficients, residuals, rank, singular values).
    _, residuals, _, _ = np.linalg.lstsq(A, y, rcond=None)
    return float(residuals[0]) if len(residuals) else 0.0


def improvement_splitting_here(T, i, seq_range):
    """Return the combined fit error when ``seq_range`` is split at index *i*."""
    return (calculate_error(T, (seq_range[0], i))
            + calculate_error(T, (i, seq_range[1])))


def Top_Down(T, max_error, seq_range=None):
    """Segment *T* top-down into pieces that a straight line fits well.

    The range is split at the index that minimises the summed line-fit error
    of the two halves. A half is split again, recursively, while its own
    error exceeds *max_error*. A range of three or more samples is always
    split at least once; *max_error* only decides about further splits.

    Every candidate split refits both halves, so the search is expensive on
    long inputs, and the recursion depth grows with the number of nested
    splits.

    Args:
        T: Sequence of numeric samples (must support slicing).
        max_error: Largest residual sum of squares tolerated for a segment.
        seq_range: ``(start, stop)`` half-open range to segment; defaults to
            the whole of *T*.

    Returns:
        List of consecutive slices of *T* that together cover the range.
        An empty range yields ``[]``.
    """
    if not seq_range:
        seq_range = (0, len(T))
    start, stop = seq_range

    # Candidate split points are start+1 .. stop-2, so a range shorter than
    # three samples has none: return it as a single segment.
    if stop - start < 3:
        return [T[start:stop]] if stop > start else []

    # Search for the split point with the lowest combined error.
    best_so_far = float('inf')
    break_point = start + 1  # fallback if no candidate compares below inf
    for i in range(start + 1, stop - 1):
        split_error = improvement_splitting_here(T, i, seq_range)
        if split_error < best_so_far:
            break_point = i
            best_so_far = split_error

    # Residual sum of squares of each half decides whether to recurse.
    left_error = calculate_error(T, (start, break_point))
    right_error = calculate_error(T, (break_point, stop))

    if left_error > max_error:
        segleft = Top_Down(T, max_error, (start, break_point))
    else:
        segleft = [T[start:break_point]]

    if right_error > max_error:
        segright = Top_Down(T, max_error, (break_point, stop))
    else:
        segright = [T[break_point:stop]]

    return segleft + segright

# Work on the first 57600 readings, as a plain Python list.
values_test = dataframe_new_values[:57600].tolist()
# Variance of that sample, as computed by np.var.
values_test_FangCha = np.var(values_test)
# Variance multiplied by the sample count.
ChanCha_2_sum = len(values_test) * values_test_FangCha
# Bare expression: displays both figures in an interactive session.
[values_test_FangCha, ChanCha_2_sum]

#平稳性检验
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller stationarity test; the lag length is selected by AIC.
check_test = adfuller(values_test, autolag='AIC')
check_test  # per the statsmodels docs: (test statistic, p-value, number of lags used, number of observations used, dict of critical values at 1% / 5% / 10%, best information criterion)
# The series is treated as stationary when the test statistic is below the 5% critical value.
# The lower the p-value (conventionally below 0.05), the stronger the evidence that the series is stationary.

保存分段结果，将分段用直线拟合后绘图

# Segment the series top-down with an error threshold of 29.3.
my_array = Top_Down(values_test, 29.3)

# Saving the segments, one list per line (disabled; kept for reference).
# fileObject = open('C:/Users/zzzz/Desktop/0306/my_array.txt','w')
# for ip in my_array:
#     fileObject.write(str(ip))
#     fileObject.write('\n')
# fileObject.close()

# Read previously saved segments back; every line is parsed as a JSON array.
# NOTE(review): the code that follows keeps using my_array, not list_source.
import json
filename = 'C:/Users/zzzz/Desktop/0306/my_array.txt'
# The with-block closes the file even if reading fails.
with open(filename, "r") as file:
    list_row = file.readlines()
list_source = [json.loads(row) for row in list_row]
print(type(list_source))

from matplotlib import pyplot as plt
# Fit a straight line to every segment and draw the fits end to end.
plt.figure(figsize=(12, 6))
num = 0  # x-offset of the current segment within the full series
for Vdata in my_array:
    X = np.arange(num, num + len(Vdata))
    Y = Vdata
    # Least-squares fit of Y = m * X + c.
    A = np.vstack([X, np.ones(len(X))]).T
    m, c = np.linalg.lstsq(A, Y, rcond=None)[0]
    plt.plot(X, m * X + c, 'r')
    num = num + len(Vdata)
plt.xlabel("Timestamp")
plt.ylabel("Value")
plt.title("sensor temperature")
plt.show()

使用均值代替每个分段

# Series in which every sample is replaced by the mean of its segment.
Y_array = []
for Vdata in my_array:
    if len(Vdata) == 0:
        # An empty segment contributes no samples and has no mean.
        continue
    # Use the builtin sum() rather than a module-level accumulator named
    # `sum`, which would shadow the builtin for all later code.
    data_mean = sum(Vdata) / len(Vdata)
    Y_array.extend([data_mean] * len(Vdata))
X = np.arange(0, len(values_test))
plt.figure(figsize=(12, 6))
plt.plot(X, Y_array, 'r')
plt.xlabel("Timestamp")
plt.ylabel("Value")
plt.title("sensor temperature")
plt.show()

保存并绘图

# Saving Y_array, one value per line (disabled; kept for reference).
# fileObject1 = open('C:/Users/zzzz/Desktop/0306/Y_array.txt','w')
# for ip in Y_array:
#     fileObject1.write(str(ip))
#     fileObject1.write('\n')
# fileObject1.close()

# Plot only the first 260 values of the mean-replaced series.
temp_y = Y_array[:260]
temp_x = np.arange(len(temp_y))
plt.figure(figsize=(12, 6))
plt.plot(temp_x, temp_y, 'r')
plt.xlabel("Timestamp")
plt.ylabel("Value")
plt.title("sensor temperature")
plt.show()

# Turn the mean-replaced series into a NumPy array.
# Alternative input (disabled): mean_value_array = np.array(values_test)
mean_value_array = np.array(Y_array)
# Cut the series into consecutive windows of 12 samples, one window per row.
# reshape raises ValueError unless the series length is a multiple of 12.
mean_value_array_set = mean_value_array.reshape(-1, 12)

# Nested-list form: one 12-value list per window.
mean_value_array_list = mean_value_array_set.tolist()

（5）使用 DTW 距离进行聚类（假设传感器数据分为正常和异常两类，且正常数据所占比例更大）

#DTW距离,动态时间规整
from math import sqrt
def DTWDistance(s1, s2, w):
    """Return the dynamic-time-warping distance between two numeric sequences.

    The point-wise cost is ``abs(a - b)``. Warping is restricted to the band
    ``abs(i - j) <= w`` around the diagonal; the band is symmetric, so the
    result does not depend on the argument order. When the sequences differ
    in length by more than *w*, the band is widened to that difference so
    that the end cell stays reachable.

    Args:
        s1: First sequence of numbers.
        s2: Second sequence of numbers.
        w: Half-width of the warping window (non-negative integer).

    Returns:
        Accumulated cost of the cheapest warping path, ``0`` for two empty
        sequences, or ``inf`` when exactly one sequence is empty.
    """
    n, m = len(s1), len(s2)
    # The end cell (n-1, m-1) lies |n - m| off the diagonal.
    w = max(w, abs(n - m))
    inf = float('inf')
    # cost[i + 1][j + 1] is the best accumulated cost of aligning s1[:i + 1]
    # with s2[:j + 1]; row 0 and column 0 form the boundary.
    cost = [[inf] * (m + 1) for _ in range(n + 1)]
    cost[0][0] = 0
    for i in range(n):
        for j in range(max(0, i - w), min(m, i + w + 1)):
            dist = abs(s1[i] - s2[j])
            cost[i + 1][j + 1] = dist + min(
                cost[i][j + 1],      # step from (i-1, j)
                cost[i + 1][j],      # step from (i, j-1)
                cost[i][j],          # step from (i-1, j-1)
            )
    return cost[n][m]

#K-means聚类
import random
#时间序列k-means聚类
def k_means_clust(data, num_clust, num_iter, w):
    """Cluster equal-length sequences with k-means under the DTW distance.

    Args:
        data: List of equal-length numeric sequences.
        num_clust: Number of clusters. The initial centroids are drawn from
            *data* with ``random.sample``, so results vary between runs.
        num_iter: Number of assignment/update iterations.
        w: Window size passed on to ``DTWDistance``.

    Returns:
        ``[assignments, centroids]``. ``assignments`` maps a centroid index
        to the list of indices into *data* assigned to it in the last
        iteration (empty when ``num_iter`` is 0). ``centroids`` is the list
        of centroid sequences.

    Raises:
        ValueError: If a sequence has no distance below infinity to any
            centroid, so it cannot be assigned.
    """
    centroids = random.sample(data, num_clust)
    assignments = {}  # defined up front so num_iter == 0 still returns
    for _ in range(num_iter):
        # Assignment step: attach every sequence to its nearest centroid.
        assignments = {}
        for ind, series in enumerate(data):
            min_dist = float('inf')
            closest_clust = None
            for c_ind, centroid in enumerate(centroids):
                cur_dist = DTWDistance(series, centroid, w)
                if cur_dist < min_dist:
                    min_dist = cur_dist
                    closest_clust = c_ind
            if closest_clust is None:
                raise ValueError(
                    "sequence %d has no finite DTW distance to any centroid" % ind
                )
            # setdefault keeps the first member of each cluster as well.
            assignments.setdefault(closest_clust, []).append(ind)

        # Update step: each centroid becomes the element-wise mean of its
        # members. Every key has at least one member here.
        for key, members in assignments.items():
            clust_sum = np.zeros(len(data[members[0]]), dtype=float)
            for k in members:
                clust_sum = clust_sum + np.array(data[k])
            centroids[key] = [m / len(members) for m in clust_sum]
    return [assignments, centroids]

# Cluster the windows into 2 groups using 10 iterations and a DTW window of 12.
# The initial centroids are sampled at random, so the result can differ between runs.
kmeans_result=k_means_clust(mean_value_array_list,2,10,12)

kmeans_result[0]为各簇所含样本的索引，kmeans_result[1]为各簇的簇中心。

本文主要介绍数据集的准备和预处理。所选数据为 kaggle 网站上发布的未进行标记的温度传感器数据，数据集中存在由于操作不当、传感器故障、控制系统故障等原因造成的异常数据。该数据记录过程中存在振动和噪声，为降低噪声的影响，先使用自上而下的分段线性表示方法对序列进行分段，再以各段的均值代替原始值（类似分段聚合近似 PAA 的思想），对其进行了降噪处理。对于未标记的数据，选择 k-means 聚类的方法，在算法中采用动态时间规整（DTW）方法进行两个序列的相似性度量，实现对数据的异常和正常两种状态的分类；规定数据量小的一簇为异常数据并进行标记，为训练分类器做准备。

这是笔者毕业设计完成过程中做的一部分工作，虽说最后由于部分原因没有用到这些内容，但在此记录一下，其中有一些经典的算法，也包含深度学习的实践。