QQ群数据的切割与归并（数据分析案例四）

本节内容的数据见电脑F:\python数据\Python海量数据（精缩版） 或 百度网盘“我的数据文件/Python海量数据”

一、先了解下QQ群的数据

1.QQ目录下的文件

（QQ目录下的文件是按照QQ群号有序排列的，我们根据QQ群找这个群里的QQ就很方便，但要是根据一个人的QQ找它所在的群这就麻烦了）

2.QQqun目录下的文件

（因为数据是有序的，两个文件归并的时候就不用排序）

以下是对QQ群数据的合并、检索、制作索引、快速查找

一、QQ群数据的合并

1.把这么多文件里面的数据合并到一个大文件里去

import os

# Directory that holds the 110 source files QunList1.txt .. QunList110.txt.
filedir = "/mnt/hgfs/E/QQ项目/QUN/qqQun"

# filelist == [".../QunList1.txt", ".../QunList2.txt", ...]
filelist = [filedir + "/QunList" + str(i) + ".txt" for i in range(1, 111)]

# Concatenate every source file, byte for byte, into one big file.
# The files are streamed line by line instead of being loaded whole with
# readlines(), and the `with` blocks close both handles even if a read or
# write fails part-way through.
with open("/mnt/hgfs/E/QQ项目/QUN/qqQun/allQQqun.txt", "wb") as allfile:
    for filepath in filelist:
        with open(filepath, "rb") as tmpfile:
            for line in tmpfile:
                allfile.write(line)

2.把这么多文件里面的数据进行处理然后再合并到一个大文件里去

# Directory that holds the 110 source files QunList1.txt .. QunList110.txt.
filedir = "/mnt/hgfs/E/newQQ/QQqun"
filelist = [filedir + "/QunList" + str(i) + ".txt" for i in range(1, 111)]

# Keep only three of the seven tab-separated fields of every record and write
# them to one merged file. Source files are streamed line by line rather than
# loaded whole, and `with` guarantees both handles are closed on error.
with open("/mnt/hgfs/E/newQQ/QQqunalllite.txt", "wb") as allqunfile:
    for filepath in filelist:
        with open(filepath, "rb") as tmpfile:
            for line in tmpfile:
                # Undecodable bytes are dropped instead of aborting the run.
                linelist = line.decode("utf-8", "ignore").split("\t")
                # A record that does not have exactly 7 fields is missing
                # information; treat it as garbage and skip it.
                if len(linelist) != 7:
                    continue
                # Pick the fields of interest and strip the double quotes.
                Qunid = linelist[1].replace("\"", "")
                Qunname = linelist[4].replace("\"", "")
                # The last field still carries whatever line terminator was
                # read from the source, so no newline is appended here.
                Quntitle = linelist[6].replace("\"", "")
                wline = Qunid + "\t" + Qunname + "\t" + Quntitle
                allqunfile.write(wline.encode("utf-8"))

二、补充：jieba库如何做到模糊搜索

1.jieba库

import jieba

mystr = "我今天与朋友聊天聊到了美女与野兽"
# jieba.cut() returns a one-shot generator. Materialize it into a list so that
# BOTH joins below see the tokens; joining the bare generator a second time
# would print an empty string.
mycut = list(jieba.cut(mystr))
print(",".join(mycut))  # 我,今天,与,朋友,聊天,聊到,了,美女,与,野兽
print("--".join(mycut))  # 我--今天--与--朋友--聊天--聊到--了--美女--与--野兽

2.分词搜索

import jieba
import jieba.posseg

mystr = "软件工程"
laststr = "软件工程1班"

# Full mode (cut_all=True) lists every possible segmentation of the text,
# e.g. ['软件', '软件工程', '工程'].
jiebastr = ",".join(jieba.cut(mystr, cut_all=True))
# jiebastr == "软件,软件工程,工程"  ->  wordlist == ['软件', '软件工程', '工程']
wordlist = jiebastr.split(",")
print(wordlist)

# Match score: the fraction of segments that occur somewhere in laststr.
length = len(wordlist)
getlength = sum(1 for word in wordlist if word in laststr)
print(getlength / length)

三、信息搜索

我们根据刚刚合并的数据，从里面搜索“妈妈”，得到的那些群号就可以做母婴用品的推销
从里面搜索“软件工程”，就可以向那些群号做IT方面的培训广告

1.常规的普通搜索

filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"

# Interactive substring search: for every query, rescan the whole file from
# the start and print each line that contains the query text.
# The loop has no natural end, so it is left with Ctrl-D / Ctrl-C; the `with`
# block then closes the file (a close() placed after `while True` would never
# be reached).
with open(filepath, "rb") as datafile:
    try:
        while True:
            searchstr = input("要查询的数据")
            datafile.seek(0, 0)  # rewind for a fresh scan
            for raw in datafile:  # stops by itself at end of file
                line = raw.decode("utf-8")
                if searchstr in line:
                    print(line, end="")
    except (EOFError, KeyboardInterrupt):
        pass

2.模糊搜索

import jieba
import jieba.posseg


def _segment(mystr):
    """Return the full-mode jieba segments of *mystr* as a list of strings."""
    return ",".join(jieba.cut(mystr, cut_all=True)).split(",")


def _match_ratio(wordlist, laststr):
    """Return the fraction of *wordlist* entries that occur in *laststr*."""
    hits = sum(1 for word in wordlist if word in laststr)
    return hits / len(wordlist)


def findata(mystr, laststr):
    """Return how well *laststr* matches the query *mystr*, from 0.0 to 1.0.

    The query is segmented with jieba in full mode; the score is the share of
    those segments that appear as substrings of *laststr*.
    """
    return _match_ratio(_segment(mystr), laststr)


filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
# Reading every line at once pulls the whole (multi-GB) file into memory; the
# payoff is that each query afterwards is answered from RAM. The file itself
# is no longer needed once loaded, so it is closed right away.
with open(filepath, "rb") as datafile:
    datalist = datafile.readlines()
print("load mem")

# Leave the loop with Ctrl-D / Ctrl-C.
try:
    while True:
        searchstr = input("要查询的数据")
        # Segment the query once per search, not once per data line.
        wordlist = _segment(searchstr)
        for raw in datalist:
            line = raw.decode("utf-8")
            # A score of at least 0.3 is enough to report the line, so this
            # returns more hits than the plain substring search.
            if _match_ratio(wordlist, line) >= 0.3:
                print(line, end="")
except (EOFError, KeyboardInterrupt):
    pass

五、制作索引

我们根据刚刚合并的数据，制作索引

1.第一种土豪的做法— 一次性读入

filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"

# "Big spender" approach: read every line into memory at once. The data file
# is closed as soon as it has been read.
with open(filepath, "rb") as Qunfile:
    Qunlist = Qunfile.readlines()
print(len(Qunlist))  # 86907937

# lengthlist[k] is the byte offset at which line k starts: 0 for the first
# line, then the running total of the lengths of all preceding lines.
# One entry per line; nothing is recorded for the end of the file.
lengthlist = []
offset = 0
for line in Qunlist:
    lengthlist.append(offset)
    offset += len(line)
del Qunlist  # release the raw lines before writing the index
print("list")
print("sort")

# Every offset is stored as a fixed-width, right-aligned 15-character decimal
# so that entry k of the index file lives at byte 15 * k.
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"
with open(savefilepath, "wb") as savefile:
    for data in lengthlist:
        savefile.write(format(data, "15d").encode("utf-8"))

2.比较节约内存的做法— 一行一行的读入

filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"

# Memory-friendlier approach: read the data one line at a time and keep only
# the start offset of each line.
# lengthlist[k] is the byte offset at which line k starts; there is one entry
# per line and none for the end of the file.
lengthlist = []
offset = 0
with open(filepath, "rb") as Qunfile:
    for line in Qunfile:
        lengthlist.append(offset)
        offset += len(line)
print("list")
print("sort")

# Every offset is stored as a fixed-width, right-aligned 15-character decimal
# so that entry k of the index file lives at byte 15 * k.
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex_disk1.txt"
with open(savefilepath, "wb") as savefile:
    for data in lengthlist:
        savefile.write(format(data, "15d").encode("utf-8"))

3.最有利于内存的方法-----边读边写

filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex_disk2.txt"

# Lowest-memory approach: write each index entry while reading, so neither
# the data lines nor the offsets are ever held in a list.
# Each entry is the byte offset at which a line starts, stored as a
# fixed-width 15-character decimal. The offset is written BEFORE advancing
# past the line, which yields exactly one entry per line -- the same layout
# the two list-based builders produce -- with no extra entry for end of file.
with open(filepath, "rb") as Qunfile:
    with open(savefilepath, "wb") as savefile:
        pos = 0
        for line in Qunfile:
            savefile.write(format(pos, "15d").encode("utf-8"))
            pos += len(line)

六、根据索引快速查找

1.随机访问

csdnfilepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"

# Random access by 1-based line number: index entry (linenum - 1) holds the
# byte offset of that line in the data file.
# Leave the loop with Ctrl-D / Ctrl-C; the `with` blocks close both files.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        try:
            while True:
                # int() rather than eval(): eval would execute whatever
                # expression the user types.
                try:
                    linenum = int(input("input lines"))
                except ValueError:
                    print("invalid line number")
                    continue
                if linenum < 1:
                    # A non-positive number would seek to a negative offset.
                    print("line number out of range")
                    continue
                csdnindexfile.seek(15 * (linenum - 1), 0)
                lineval = csdnindexfile.read(15)  # one 15-character entry
                if len(lineval) < 15:
                    # Read past the end of the index: no such line.
                    print("line number out of range")
                    continue
                csdnfile.seek(int(lineval), 0)  # jump to the start of the line
                line = csdnfile.readline().decode("utf-8", "ignore")
                print(line)
        except (EOFError, KeyboardInterrupt):
            pass

2.二分查找

def search2(searchstr, total=86907937):
    """Binary-search the indexed data file for a line whose first field is *searchstr*.

    Positions are 1-based: position p is stored in index entry p - 1, i.e. at
    byte 15 * (p - 1) of the index file. The search covers positions
    1..total. (Searching 0..total-1 with the same seek formula would seek to
    a negative offset for position 0 and never look at the last entry.)

    Reads the module-level handles ``csdnindexfile`` and ``csdnfile``.

    Args:
        searchstr: Key to look for, compared as a string with the first
            tab-separated field of each line.
        total: Number of entries in the index file.

    Returns:
        The 1-based position of a matching line, or -1 if there is none.
    """
    # NOTE(review): keys are compared as strings, so this assumes the data is
    # ordered lexicographically by its first field -- confirm against the data.
    low = 1
    high = total
    times = 0
    while low <= high:
        times += 1
        print("times", times)
        mid = (low + high) // 2
        csdnindexfile.seek(15 * (mid - 1), 0)  # entry for position mid
        lineval = int(csdnindexfile.read(15))  # 15-character decimal offset
        csdnfile.seek(lineval, 0)
        line = csdnfile.readline().decode("utf-8", "ignore")
        middata = line.split("\t")[0]
        if searchstr < middata:  # key is in the lower half
            high = mid - 1
        elif searchstr > middata:  # key is in the upper half
            low = mid + 1
        else:
            print("find", line, mid)
            return mid
    print("not find")
    return -1


csdnfilepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"

# Leave the loop with Ctrl-D / Ctrl-C; the `with` blocks close both files.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        try:
            while True:
                searchstr = input("input searchstr")  # the key to look up
                search2(searchstr)
        except (EOFError, KeyboardInterrupt):
            pass

以下是对QQ文件的合并、制作索引、根据索引迅速查找

一、合并QQ文件

右边文件中依次是（编号、QQ号、QQ名、年龄、性别、？、QQ群号）

我们依次拿到（QQ号：21605735、QQ名：晴子、年龄：22，QQ群号：100100）

filedir = "/mnt/hgfs/E/newQQ/QQ"
# The 110 source files Group1.txt .. Group110.txt.
filelist = [filedir + "/Group" + str(i) + ".txt" for i in range(1, 111)]

# Read every source file once and write the selected fields of each record to
# the merged file. Source files are streamed line by line rather than loaded
# whole, and `with` guarantees both handles are closed on error.
with open("/mnt/hgfs/E/newQQ/QQall.txt", "wb") as allqunfile:
    for filepath in filelist:
        print(filepath)
        with open(filepath, "rb") as tmpfile:
            for line in tmpfile:
                # Undecodable bytes are dropped instead of aborting the run.
                linelist = line.decode("utf-8", "ignore").split("\t")
                # Field 6 is read below; a shorter record is incomplete and is
                # skipped instead of raising IndexError.
                if len(linelist) < 7:
                    continue
                # Pick the fields of interest and strip the double quotes.
                QQ = linelist[1].replace("\"", "")
                QQname = linelist[2].replace("\"", "")
                QQage = linelist[3].replace("\"", "")
                QQqun = linelist[6].replace("\"", "")
                wline = QQ + "\t" + QQname + "\t" + QQage + "\t" + QQqun
                allqunfile.write(wline.encode("utf-8"))

得到的文件是按照QQ群号有序排列的，所以它比较适合根据QQ群号找这个群里的QQ号

二、QQ数据的索引

QQall.txt文件是按照QQ群号有序排列的，所以它比较适合根据QQ群号找这个群里的QQ号

filepath = "/mnt/hgfs/E/newQQ/QQall.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQallindex.txt"

# Build the offset index while reading, so nothing is held in memory.
# Each entry is the byte offset at which a line starts, stored as a
# fixed-width 15-character decimal. The offset is written BEFORE advancing
# past the line, which yields exactly one entry per line and no extra entry
# for the end of the file.
with open(filepath, "rb") as Qunfile:
    with open(savefilepath, "wb") as savefile:
        pos = 0
        for line in Qunfile:
            savefile.write(format(pos, "15d").encode("utf-8"))
            pos += len(line)

三、根据索引随机访问QQ群

csdnfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_name_qq.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_qq_index.txt"

# Random access by 1-based line number: index entry (linenum - 1) holds the
# byte offset of that line in the data file.
# Leave the loop with Ctrl-D / Ctrl-C; the `with` blocks close both files.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        try:
            while True:
                # int() rather than eval(): eval would execute whatever
                # expression the user types.
                try:
                    linenum = int(input("input lines"))
                except ValueError:
                    print("invalid line number")
                    continue
                if linenum < 1:
                    # A non-positive number would seek to a negative offset.
                    print("line number out of range")
                    continue
                csdnindexfile.seek(15 * (linenum - 1), 0)
                lineval = csdnindexfile.read(15)  # one 15-character entry
                if len(lineval) < 15:
                    # Read past the end of the index: no such line.
                    print("line number out of range")
                    continue
                csdnfile.seek(int(lineval), 0)  # jump to the start of the line
                line = csdnfile.readline().decode("utf-8", "ignore")
                print(line)
        except (EOFError, KeyboardInterrupt):
            pass

四、根据QQ群查找QQ

def _read_record(position):
    """Return ``(group_number, line)`` for the 1-based index *position*."""
    csdnindexfile.seek(15 * (position - 1), 0)  # entry for this position
    offset = int(csdnindexfile.read(15))  # 15-character decimal offset
    csdnfile.seek(offset, 0)
    line = csdnfile.readline().decode("utf-8", "ignore")
    # The third " # "-separated field is the group number.
    return int(line.split(" # ")[2]), line


def search2(searchstr, total=1449403409):
    """Return every line whose group number equals *searchstr*.

    A binary search over positions 1..total finds one matching line; the
    neighbours above and below it are then scanned while they still carry the
    same group number. Positions are 1-based (position p is index entry
    p - 1), so no lookup ever seeks to a negative offset and the last entry
    is reachable.

    Reads the module-level handles ``csdnindexfile`` and ``csdnfile``.

    Args:
        searchstr: Group number to look for, as an int.
        total: Number of entries in the index file.

    Returns:
        A list of matching lines -- the first hit, then the lines above it
        (nearest first), then the lines below it -- or -1 if nothing matches.
    """
    low = 1
    high = total
    while low <= high:
        mid = (low + high) // 2
        middata, line = _read_record(mid)
        if searchstr < middata:  # key is in the lower half
            high = mid - 1
        elif searchstr > middata:  # key is in the upper half
            low = mid + 1
        else:
            QQlist = [line]  # start with the line the binary search hit
            # Scan upwards; every match lies inside the current [low, high]
            # window, so the scan never needs to leave it.
            tmp_up = mid - 1
            while tmp_up >= low:
                upmiddata, upline = _read_record(tmp_up)
                if upmiddata != searchstr:
                    break
                QQlist.append(upline)
                tmp_up -= 1
            # Scan downwards in the same way.
            tmp_down = mid + 1
            while tmp_down <= high:
                downmiddata, downline = _read_record(tmp_down)
                if downmiddata != searchstr:
                    break
                QQlist.append(downline)
                tmp_down += 1
            return QQlist
    print("not find")
    return -1


csdnfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_name_qq.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_qq_index.txt"

# Leave the loop with Ctrl-D / Ctrl-C; the `with` blocks close both files.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        try:
            while True:
                # int() rather than eval(): eval would execute whatever
                # expression the user types.
                try:
                    searchstr = int(input("input searchstr"))
                except ValueError:
                    print("invalid group number")
                    continue
                QQlist = search2(searchstr)
                if QQlist == -1:
                    # Nothing matched; -1 is not iterable.
                    continue
                for QQline in QQlist:
                    print(QQline, end="")
        except (EOFError, KeyboardInterrupt):
            pass

五、根据QQ查找它所在的QQ群

那么如何根据QQ查找它所在的QQ群呢？
按理来讲我们要根据QQ查找它所在的QQ群需要一份按照QQ号排列的正序文件
下面的qq_name_qun.txt是乱序的，它虽然从QQ群号上看是正序的，但是从QQ号上看是乱序的，所以我们做了一份倒排索引文件（index.txt是按照QQ号正序排列制作的索引，只要有一个正序的索引，就可以了）

根据QQ查找它所在的QQ群

def _read_record(position):
    """Return ``(qq_number, line)`` for the 1-based index *position*."""
    csdnindexfile.seek(15 * (position - 1), 0)  # entry for this position
    offset = int(csdnindexfile.read(15))  # 15-character decimal offset
    csdnfile.seek(offset, 0)
    line = csdnfile.readline().decode("utf-8", "ignore")
    # The first " # "-separated field is the QQ number, kept as a string.
    return line.split(" # ")[0], line


def search2(searchstr, total=1449403409):
    """Return every line whose QQ number equals *searchstr*.

    A binary search over positions 1..total finds one matching line; the
    neighbours above and below it are then scanned while they still carry the
    same QQ number. Positions are 1-based (position p is index entry p - 1),
    so no lookup ever seeks to a negative offset and the last entry is
    reachable.

    Reads the module-level handles ``csdnindexfile`` and ``csdnfile``.

    Args:
        searchstr: QQ number to look for, as a string.
        total: Number of entries in the index file.

    Returns:
        A list of matching lines -- the first hit, then the lines above it
        (nearest first), then the lines below it -- or -1 if nothing matches.
    """
    # NOTE(review): keys are compared as strings, so this assumes the index
    # orders the QQ numbers lexicographically -- confirm how index.txt was
    # sorted.
    low = 1
    high = total
    while low <= high:
        mid = (low + high) // 2
        middata, line = _read_record(mid)
        if searchstr < middata:  # key is in the lower half
            high = mid - 1
        elif searchstr > middata:  # key is in the upper half
            low = mid + 1
        else:
            QQlist = [line]  # start with the line the binary search hit
            # Scan upwards; every match lies inside the current [low, high]
            # window, so the scan never needs to leave it.
            tmp_up = mid - 1
            while tmp_up >= low:
                upmiddata, upline = _read_record(tmp_up)
                if upmiddata != searchstr:
                    break
                QQlist.append(upline)
                tmp_up -= 1
            # Scan downwards in the same way.
            tmp_down = mid + 1
            while tmp_down <= high:
                downmiddata, downline = _read_record(tmp_down)
                if downmiddata != searchstr:
                    break
                QQlist.append(downline)
                tmp_down += 1
            return QQlist
    print("not find")
    return -1


csdnfilepath = "/mnt/hgfs/E/newQQ/qq_data/qq_name_qun.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qq_data/index.txt"

# Leave the loop with Ctrl-D / Ctrl-C; the `with` blocks close both files.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        try:
            while True:
                searchstr = input("input searchstr")
                QQqunlist = search2(searchstr)
                if QQqunlist == -1:
                    # Nothing matched; -1 is not iterable.
                    continue
                for QQqun in QQqunlist:
                    print(QQqun)
        except (EOFError, KeyboardInterrupt):
            pass