I had just started learning Python when I picked this topic, and I'm posting the code here as a keepsake. No popular framework is used, which is why the amount of code is fairly large.

The GUI is written with wxPython.
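For readers new to wxPython, the GUI below repeats one pattern throughout: create a wx.Frame, attach a wx.Panel, place widgets at fixed positions, and bind button events to handler methods. Here is a minimal, hedged sketch of that pattern (the names are illustrative and not taken from the project):

import wx

class DemoApp(wx.App):
    def OnInit(self):
        # One frame, one panel, one button bound to a handler on the App object,
        # which is the same binding style the spider code below uses.
        frame = wx.Frame(None, title="Demo", size=(300, 200))
        panel = wx.Panel(frame, -1)
        button = wx.Button(panel, -1, "Click me", pos=(100, 80))
        self.Bind(wx.EVT_BUTTON, self.on_click, button)
        frame.Show()
        return True

    def on_click(self, event):
        wx.MessageBox("Button clicked")

if __name__ == "__main__":
    DemoApp().MainLoop()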

# _*_ coding: UTF-8 _*_
import os
import re
import requests
import sys
import wx
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree

import data_analysis

file_path = ''

class Wb(wx.App):

    def Operate(self):
        self.cookie = {}
        self.username = ''      # user nickname, e.g. "Dear-迪丽热巴"
        self.Number = 0         # total number of posts the user has published
        self.number1 = 0        # number of posts actually crawled
        self.Guanzhu = 0        # following count
        self.fans = 0           # follower count
        self.Content = []       # post contents
        self.Time = []          # publish time of each post
        self.star = []          # like count of each post
        self.Zhuanfa = []       # repost count of each post
        self.Pinglun = []       # comment count of each post
        self.publish_tool = []  # publishing tool of each post
        self.Id = 0             # id of the user to crawl
        self.a = 0              # index of the post currently shown by weibo_info()/cont()

        # ====================================== GUI ======================================
        # Create the main window and its panel
        self.frame_operate = wx.Frame(
            None, title="Weibo_Spider_GUI", size=(500, 500))
        self.panel_operate = wx.Panel(self.frame_operate, -1)
        # Font settings
        self.font1 = wx.Font(18, wx.ROMAN, wx.ITALIC, wx.NORMAL)
        self.label1 = wx.StaticText(
            self.panel_operate, -1, "WeiBo Spider", pos=(180, 60), style=wx.ALIGN_CENTER)
        self.label1.SetFont(self.font1)
        # Label and text box for the cookie
        self.label2 = wx.StaticText(
            self.panel_operate, -1, "请输入您微博登陆的有效cookie", pos=(160, 130), style=wx.ALIGN_CENTER)
        self.textCookie = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 150), size=(80, 20), style=wx.TE_CENTER)
        # Label and text box for the id of the account to crawl
        self.label3 = wx.StaticText(
            self.panel_operate, -1, "请输入您所要爬取微博账号的Id", pos=(160, 180), style=wx.ALIGN_CENTER)
        self.textId = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 200), size=(80, 20), style=wx.TE_CENTER)
        # Label and text box for the output file path
        self.label4 = wx.StaticText(
            self.panel_operate, -1, "数据文件保存路径", pos=(160, 230), style=wx.ALIGN_CENTER)
        self.textFile_path = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 250), size=(80, 20), style=wx.TE_CENTER)
        # "Start crawling" button
        self.button_start = wx.Button(self.panel_operate, -1, "开始爬取微博信息", pos=(200, 350))
        # Bind the button event
        self.Bind(wx.EVT_BUTTON, self.get_cookie, self.button_start)
        self.frame_operate.Show()

    # Main spider UI ---------------------------------------------------------------------
    # Read the values the user typed in.
    # Note: GetValue() cannot sit in the same function that builds the input widgets; the two
    # have to be written separately, and reading different values also goes into separate handlers.
    def get_cookie(self, event):
        self.cookie = {"Cookie": self.textCookie.GetValue()}
        self.Id = int(self.textId.GetValue())
        global file_path
        file_path = self.textFile_path.GetValue() + os.sep + "%d" % self.Id + ".txt"
        self.Onbutton_Start()

    def Onbutton_Start(self):
        self.GetName()          # get the user's nickname
        self.GetSimple_Info()   # get post count, following count and follower count
        self.weibo_para()
        self.write_txt()
        self.weibo_UI1()

    def weibo_UI1(self):
        # Pop up a message box: crawling finished
        message = "文件爬取完毕"
        wx.MessageBox(message)
        self.weibo_UI2()

    def weibo_UI2(self):
        # New window showing the user's profile information
        self.frame_operate.Destroy()
        self.frame_Info = wx.Frame(None, title="User_Information", size=(500, 500))
        self.panel_Info = wx.Panel(self.frame_Info, -1)
        t1 = "用户昵称:" + str(self.username)
        t2 = "微博数:" + str(self.Number)
        t3 = "粉丝数:" + str(self.fans)
        t4 = "关注数:" + str(self.Guanzhu)
        self.label16 = wx.StaticText(self.panel_Info, -1, self.username, pos=(200, 100), style=wx.ALIGN_LEFT)
        self.label5 = wx.StaticText(self.panel_Info, -1, t1, pos=(180, 130), style=wx.ALIGN_LEFT)
        self.label13 = wx.StaticText(self.panel_Info, -1, t2, pos=(180, 150), style=wx.ALIGN_LEFT)
        self.label14 = wx.StaticText(self.panel_Info, -1, t3, pos=(180, 170), style=wx.ALIGN_LEFT)
        self.label15 = wx.StaticText(self.panel_Info, -1, t4, pos=(180, 190), style=wx.ALIGN_LEFT)
        self.font2 = wx.Font(13, wx.SCRIPT, wx.ITALIC, wx.NORMAL)   # small font; font1 is the large one
        self.label16.SetFont(self.font1)
        self.label5.SetFont(self.font2)
        self.label13.SetFont(self.font2)
        self.label14.SetFont(self.font2)
        self.label15.SetFont(self.font2)
        self.button_news = wx.Button(self.panel_Info, -1, "查看最近微博", pos=(220, 280))
        self.Bind(wx.EVT_BUTTON, self.weibo_UI3, self.button_news)
        self.frame_Info.Show()

    # Latest weibo post
    def weibo_UI3(self, event):
        self.frame_Info.Destroy()
        self.frame_news = wx.Frame(None, title="---", size=(500, 500))
        self.panel_news = wx.Panel(self.frame_news, -1)
        label18 = wx.StaticText(self.panel_news, -1, "最新微博动态", pos=(200, 40))
        if self.Content:
            text1 = "最新/置顶 微博为: " + self.Content[0]
            text2 = "最新/置顶 微博发布工具: " + self.publish_tool[0]
            text3 = "最新/置顶 微博发布时间: " + self.Time[0]
            text4 = "最新/置顶 微博获得赞数: " + str(self.star[0])
            text5 = "最新/置顶 微博获得转发数: " + str(self.Zhuanfa[0])
            text6 = "最新/置顶 微博获得评论数: " + str(self.Pinglun[0])
            self.label6 = wx.TextCtrl(self.panel_news, -1, text1, pos=(90, 60), size=(250, 140),
                                      style=wx.TE_MULTILINE | wx.TE_RICH)
            self.label7 = wx.StaticText(self.panel_news, -1, text2, pos=(90, 200), style=wx.ALIGN_LEFT)
            self.label8 = wx.StaticText(self.panel_news, -1, text3, pos=(90, 220), style=wx.ALIGN_LEFT)
            self.label9 = wx.StaticText(self.panel_news, -1, text4, pos=(90, 240), style=wx.ALIGN_LEFT)
            self.label10 = wx.StaticText(self.panel_news, -1, text5, pos=(90, 260), style=wx.ALIGN_LEFT)
            self.label11 = wx.StaticText(self.panel_news, -1, text6, pos=(90, 280), style=wx.ALIGN_LEFT)
        # Button: browse the earlier posts one by one
        self.Button_info = wx.Button(self.panel_news, -1, "点击查看之前的微博内容", pos=(220, 340))
        self.Bind(wx.EVT_BUTTON, self.weibo_pre_info, self.Button_info)
        # Button: open the data-analysis charts
        self.Button_file = wx.Button(self.panel_news, -1, "点击查看微博数据分析图表", pos=(220, 380))
        self.Bind(wx.EVT_BUTTON, self.analysis_UI, self.Button_file)
        self.frame_news.Show()

    def analysis_UI(self, event):
        self.frame_data = wx.Frame(None, title="data_analysis--20177830115", size=(500, 500))
        self.panel_data = wx.Panel(self.frame_data, -1)
        text1 = "2017-2018微博转发/点赞量折线统计图"
        text2 = '原创微博与转发微博统计图'
        text3 = '微博发布工具统计图'
        text4 = '微博使用心情统计图'
        self.button_1 = wx.Button(self.panel_data, -1, text1, pos=(180, 120))
        self.button_2 = wx.Button(self.panel_data, -1, text2, pos=(180, 160))
        self.button_3 = wx.Button(self.panel_data, -1, text3, pos=(180, 200))
        self.button_4 = wx.Button(self.panel_data, -1, text4, pos=(180, 240))
        self.Bind(wx.EVT_BUTTON, self.figure_1, self.button_1)
        self.Bind(wx.EVT_BUTTON, self.figure_2, self.button_2)
        self.Bind(wx.EVT_BUTTON, self.figure_3, self.button_3)
        self.Bind(wx.EVT_BUTTON, self.figure_4, self.button_4)
        self.frame_data.Show()

    def figure_1(self, event):
        global file_path
        figure = data_analysis.analysis(file_path, self.Number)
        figure.analyse_Zhexian()

    def figure_2(self, event):
        global file_path
        figure = data_analysis.analysis(file_path, self.Number)
        figure.analyse_YC()

    def figure_3(self, event):
        global file_path
        figure = data_analysis.analysis(file_path, self.Number)
        figure.analyse_GJ()

    def figure_4(self, event):
        global file_path
        figure = data_analysis.analysis(file_path, self.Number)
        figure.analyse_XQ()

    # Pass-through handler: lets weibo_info() be re-entered repeatedly (one post per window)
    # without needing a fresh button event each time.
    def weibo_pre_info(self, event):
        self.weibo_info()

    def weibo_info(self):
        # flag = 1  # originally a counter: flag == 1 keep looping, flag == 0 stop showing posts;
        # a for loop does not work with this one-window-per-post design, so the method re-enters itself.
        self.s = wx.Frame(None, title="---", size=(500, 500))
        self.f = wx.Panel(self.s, -1)
        # for i in range(1, self.Number + 1):
        text1 = str(self.a + 1) + ":" + self.Content[self.a]
        text2 = "发布工具: " + self.publish_tool[self.a]
        text3 = "发布时间: " + self.Time[self.a]
        text4 = "点赞数: " + str(self.star[self.a])
        text5 = "转发数: " + str(self.Zhuanfa[self.a])
        text6 = "评论数: " + str(self.Pinglun[self.a])
        # A StaticText only renders the post body as a single line here, and inserting manual
        # line breaks is messy, so a multi-line TextCtrl stands in for the static label; the
        # value is editable, but it displays the content the way it should.
        self.labela = wx.TextCtrl(self.f, -1, text1, pos=(80, 60), size=(250, 140),
                                  style=wx.TE_MULTILINE | wx.TE_RICH)
        self.labelb = wx.StaticText(self.f, -1, text2, pos=(80, 200), style=wx.ALIGN_LEFT)
        self.labelc = wx.StaticText(self.f, -1, text3, pos=(80, 220), style=wx.ALIGN_LEFT)
        self.labeld = wx.StaticText(self.f, -1, text4, pos=(80, 240), style=wx.ALIGN_LEFT)
        self.labele = wx.StaticText(self.f, -1, text5, pos=(80, 260), style=wx.ALIGN_LEFT)
        self.labelf = wx.StaticText(self.f, -1, text6, pos=(80, 280), style=wx.ALIGN_LEFT)
        self.button_next = wx.Button(self.f, -1, "查看下一条", pos=(300, 380))
        self.button_exit = wx.Button(self.f, -1, "关闭", pos=(100, 380))
        self.Bind(wx.EVT_BUTTON, self.exit, self.button_exit)
        self.Bind(wx.EVT_BUTTON, self.cont, self.button_next)
        self.s.Show()

    def exit(self, event):
        self.s.Destroy()

    def cont(self, event):
        self.a += 1
        self.s.Destroy()
        self.weibo_info()


The crawler itself is adapted from a project by an experienced author on GitHub.
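Before the full crawler methods, here is a minimal, hedged sketch of the fetch-and-parse pattern they all follow: requests with the login cookie attached, then lxml's etree.HTML plus an XPath query. The uid and cookie values below are placeholders, not real ones:

import requests
from lxml import etree

cookie = {"Cookie": "paste a valid weibo.cn cookie string here"}   # placeholder
uid = 1234567890                                                   # placeholder user id

html = requests.get("https://weibo.cn/%d/info" % uid, cookies=cookie).content
selector = etree.HTML(html)                    # tolerant HTML parser
title = selector.xpath("//title/text()")[0]    # e.g. "XXX的微博" on the profile page
print(title)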

    # --- continuation of class Wb: crawler methods ---

    # Get the user's nickname
    def GetName(self):
        url = "https://weibo.cn/%d/info" % (self.Id)
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)   # parse the returned HTML, completing any broken tags
        username = selector.xpath("//title/text()")[0]   # text of the first <title> element
        self.username = username[:-3]   # the title looks like "XXX的微博"; strip the last three characters

    # Get the user's post count, following count and follower count
    def GetSimple_Info(self):
        url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)   # parse into a proper HTML tree
        pattern = r"\d+\.?\d*"
        # post count, taken from text like "微博[1543]&nbsp"
        wb_num = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
        regx = re.findall(pattern, wb_num, re.S | re.M)   # keep only the digits
        for value in regx:
            num_wb = int(value)
            break
        self.Number = num_wb
        # following count
        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
        regx = re.findall(pattern, str_gz, re.M)
        self.Guanzhu = int(regx[0])
        # follower count
        str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
        regx = re.findall(pattern, str_fs, re.M)
        self.fans = int(regx[0])

    # Get the full text of a "long" post
    def GetLong(self, weibo_link):
        html = requests.get(weibo_link, cookies=self.cookie).content
        selector = etree.HTML(html)
        info = selector.xpath("//div[@class='c']")[1]
        wb_content = info.xpath("div/span[@class='ctt']")[0].xpath(
            "string(.)").replace(u"\u200b", "").encode(
            sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
        return wb_content

    # Get the information of a reposted weibo
    def GetZhuanfa(self, is_retweet, info, wb_content):
        original_user = is_retweet[0].xpath("a/text()")
        if not original_user:
            wb_content = u"转发微博已被删除"
            return wb_content
        else:
            original_user = original_user[0]
            retweet_reason = info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", "").encode(
                sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
            retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")]
            wb_content = (retweet_reason + "\n" + u"原始用户: " +
                          original_user + "\n" + u"转发内容: " + wb_content)
            return wb_content

    # Collect, for every post: content, publish time, publishing tool, like/repost/comment counts
    # (later shown one post per window by weibo_info()).
    def weibo_para(self):
        url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = int(selector.xpath(
                "//input[@name='mp']")[0].attrib["value"])
        pattern = r"\d+\.?\d*"
        for page in range(1, page_num + 1):
            url2 = "https://weibo.cn/u/%d?&page=%d" % (self.Id, page)
            html2 = requests.get(url2, cookies=self.cookie).content
            selector2 = etree.HTML(html2)
            info = selector2.xpath("//div[@class='c']")
            is_empty = info[0].xpath("div/span[@class='ctt']")
            if is_empty:
                for i in range(0, len(info) - 2):
                    # post content
                    str_t = info[i].xpath("div/span[@class='ctt']")
                    Content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode(
                        sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
                    Content = Content[:-1]
                    weibo_Id = info[i].xpath("@id")[0][2:]
                    a_link = info[i].xpath("div/span[@class='ctt']/a")
                    is_retweet = info[i].xpath("div/span[@class='cmt']")
                    if a_link:
                        if a_link[-1].xpath("text()")[0] == u"全文":
                            weibo_link = "https://weibo.cn/comment/" + weibo_Id
                            wb_content = self.GetLong(weibo_link)
                            if wb_content:
                                if not is_retweet:
                                    wb_content = wb_content[1:]
                                Content = wb_content
                    if is_retweet:
                        Content = self.GetZhuanfa(is_retweet, info[i], Content)
                    self.Content.append(Content)
                    # publish time
                    str_time = info[i].xpath("div/span[@class='ct']")
                    str_time = str_time[0].xpath("string(.)").encode(
                        sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
                    Time = str_time.split(u'来自')[0]
                    if u"刚刚" in Time:
                        Time = datetime.now().strftime('%Y-%m-%d %H:%M')
                    elif u"分钟" in Time:
                        minute = Time[:Time.find(u"分钟")]
                        minute = timedelta(minutes=int(minute))
                        Time = (datetime.now() - minute).strftime("%Y-%m-%d %H:%M")
                    elif u"今天" in Time:
                        today = datetime.now().strftime("%Y-%m-%d")
                        time = Time[3:]
                        Time = today + " " + time
                    elif u"月" in Time:
                        year = datetime.now().strftime("%Y")
                        month = Time[0:2]
                        day = Time[3:5]
                        time = Time[7:12]
                        Time = (year + "-" + month + "-" + day + " " + time)
                    else:
                        Time = Time[:16]
                    self.Time.append(Time)
                    # footer of the post: "赞[x] 转发[y] 评论[z]"
                    str_footer = info[i].xpath("div")[-1]
                    str_footer = str_footer.xpath("string(.)").encode(
                        sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
                    str_footer = str_footer[str_footer.rfind(u'赞'):]
                    regx = re.findall(pattern, str_footer, re.M)
                    # publishing tool
                    if len(str_time.split(u'来自')) > 1:
                        publish_tool = str_time.split(u'来自')[1]
                    else:
                        publish_tool = u"无"
                    self.publish_tool.append(publish_tool)
                    # like count
                    star = int(regx[0])
                    self.star.append(star)
                    # repost count
                    Zhuanfa = int(regx[1])
                    self.Zhuanfa.append(Zhuanfa)
                    # comment count
                    Pinglun = int(regx[2])
                    self.Pinglun.append(Pinglun)
                    self.number1 += 1

    # Write the crawled information to a text file ------------------------------------------
    def write_txt(self):
        try:
            contents_header = u"\n\n微博内容: \n"
            contents = (u"用户信息\n用户昵称:" + self.username +
                        u"\n用户Id: " + str(self.Id) +
                        u"\n微博数: " + str(self.Number) +
                        u"\n关注数: " + str(self.Guanzhu) +
                        u"\n粉丝数: " + str(self.fans) + contents_header + '\n')
            for i in range(1, self.number1 + 1):
                text = (str(i) + ":" + self.Content[i - 1] + "\n" +
                        u"发布工具: " + self.publish_tool[i - 1] + "\n" +
                        u"发布时间: " + self.Time[i - 1] + "\n" +
                        u"点赞数: " + str(self.star[i - 1]) +
                        u"转发数: " + str(self.Zhuanfa[i - 1]) +
                        u"评论数: " + str(self.Pinglun[i - 1]) + "\n\n")
                contents = contents + text
            global file_path
            f = open(file_path, "wb")
            # write UTF-8 so data_analysis, which reads the file as UTF-8, can open it
            f.write(contents.encode("utf-8"))
            f.close()
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()


The entry point / test code:

def main():
    weibo = Wb()
    weibo.Operate()
    weibo.MainLoop()


if __name__ == "__main__":
    main()


Data analysis part: the charts are drawn with matplotlib. I have only scratched the surface of it, so the plots are not very polished, the data set is small, and the analysis may have issues. Four charts are produced in total: a 2017-2018 repost/like line chart, an original-vs-reposted pie chart, a publishing-tool pie chart, and an emoji/mood usage pie chart.
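Before the analysis code, a minimal, hedged sketch of the two matplotlib ingredients the four charts share: registering a CJK-capable font so the Chinese labels render, and drawing a basic pie chart. The labels and sizes here are made up for illustration, and SimHei is a Windows font, so substitute another installed CJK font on other systems:

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels (Windows font)
plt.rcParams['axes.unicode_minus'] = False     # keep the minus sign displaying correctly

labels = ['转发微博', '原创微博']               # illustrative values only
sizes = [40, 60]
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=150)
plt.title("示例饼图")
plt.show()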

import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from matplotlib import font_manager as fm
import time
from datetime import datetime


class analysis(object):

    def __init__(self, file_name, number):
        self.file_name = file_name   # path of the text file written by the spider
        self.number = number         # total number of posts
        self.X_data = []
        self.Y1_data = []
        self.Y_data = []
        self.str = ""

    # Line chart of repost/like counts over time
    def analyse_Zhexian(self):
        pattern = re.compile(r'转发数: \d+')    # repost-count field
        pattern1 = re.compile(r'\d+')           # extract the digits from a matched field
        pattern2 = re.compile(r'发布时间: (\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})')  # publish time
        pattern3 = re.compile(r'.*2016.*')
        pattern4 = re.compile(r'点赞数: \d+')   # like-count field
        with open(self.file_name, encoding="utf-8") as f:
            text = f.read()
        # Pull the required fields out with regular expressions
        result = pattern.findall(text)
        ls3 = ''.join(result)                    # join into one string
        Result = pattern1.findall(ls3)           # all repost numbers
        Num_Zhuanfa = [int(x) for x in Result]   # y axis 1
        result1 = pattern4.findall(text)
        ls1 = ''.join(result1)
        Result1 = pattern1.findall(ls1)
        Num_Dianzan = [int(x) for x in Result1]  # y axis 2
        # x axis: publish times, converted to datetime below
        Num_Zhuanfa_time = pattern2.findall(text)
        stop = len(Num_Zhuanfa_time)             # default: keep everything if no 2016 post exists
        for i in range(0, len(Num_Zhuanfa_time)):
            if pattern3.findall(Num_Zhuanfa_time[i]):
                stop = i                         # index of the first 2016 post (posts are newest-first)
                break
        # Keep only the 2017-2018 data: with every post included the x-axis date labels overlap
        # badly in matplotlib. This thinning is crude and the labels may still overlap for very
        # active accounts; better suggestions are welcome.
        Num_Zhuanfa = Num_Zhuanfa[0:stop:1]
        Num_Zhuanfa_time = Num_Zhuanfa_time[0:stop:1]
        Num_Dianzan = Num_Dianzan[0:stop:1]
        # Divide by 1000 so the plot stays readable
        for i in range(0, len(Num_Zhuanfa)):
            Num_Zhuanfa[i] = Num_Zhuanfa[i] / 1000
        for i in range(0, len(Num_Dianzan)):
            Num_Dianzan[i] = Num_Dianzan[i] / 1000
        # Convert the time strings to timestamps and then to datetime objects
        aa = [time.strptime(i, "%Y-%m-%d %H:%M") for i in Num_Zhuanfa_time]
        timeStamp = [int(time.mktime(a)) for a in aa]
        Num_Zhuanfa_time = [datetime.fromtimestamp(k) for k in timeStamp]
        # Thin the data further: group the points and keep one representative per group
        number = len(Num_Zhuanfa)
        Group = int(0.18 * number)
        k = number // Group
        for i in range(0, Group):
            self.X_data.append(Num_Zhuanfa_time[i * k])
            self.Y_data.append(Num_Zhuanfa[i * k])
            self.Y1_data.append(Num_Dianzan[i * k])
        # Draw the two lines
        fig1 = plt.figure(figsize=(8, 5))
        plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
        ax1 = fig1.add_subplot(1, 1, 1)
        ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H-%M'))  # time label format
        plt.xticks(self.X_data, rotation=90)           # print the time labels vertically
        plt.yticks(np.linspace(0, 5000, 5, endpoint=True))
        plt.title(u"2017-2018微博转发/点赞量折线图", color="black")
        plt.plot(self.X_data, self.Y_data, "o-", color='skyblue', label="转发量", markersize=1.5)
        plt.plot(self.X_data, self.Y1_data, "o-", color='pink', label="点赞量", markersize=1.5)
        plt.xlabel("发布时间")
        plt.ylabel("数量(千/条)")
        plt.legend()   # show the legend
        plt.show()

    def analyse_YC(self):
        pattern = re.compile(r'转发理由')
        with open(self.file_name, encoding="utf-8") as f:
            text = f.read()
        Zhuanfa = pattern.findall(text)
        Number_Zhuanfa = int(len(Zhuanfa))
        Yuanchuang = self.number - Number_Zhuanfa
        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = ['转发微博', '原创微博']
        sizes = [Number_Zhuanfa, Yuanchuang]
        explode = (0.1, 0)
        plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
        plt.title(u"原创与转发微博量", color="black")
        plt.show()

    def analyse_GJ(self):
        pattern = re.compile(r'发布工具: (.*)\n发布时间')
        with open(self.file_name, encoding="utf-8") as f:
            text = f.read()
        number_GJ = pattern.findall(text)
        # print(number_GJ)
        gongju = dict()
        for i in number_GJ:
            name = i
            if name in gongju:
                gongju[name] += 1
            else:
                gongju[name] = 1
        # Drop tools that appear 10 times or fewer
        for key in list(gongju.keys()):
            if gongju[key] <= 10:
                del gongju[key]
        labels = list(gongju.keys())
        sizes = list(gongju.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=150)
        plt.title(u"微博发布工具统计", color="black")
        plt.show()

    def analyse_XQ(self):
        pattern = re.compile(r'\[(.{1,4})\].*\[(.{1,4})\]')
        with open(self.file_name, encoding="utf-8") as f:
            text = f.read()
        number_XQ = pattern.findall(text)
        # print(number_XQ)
        a = []
        for i in range(0, len(number_XQ)):
            for j in range(0, len(number_XQ[i])):
                a.append(number_XQ[i][j])
        biaoqing = dict()
        for i in a:
            name = i
            if name in biaoqing:
                biaoqing[name] += 1
            else:
                biaoqing[name] = 1
        # Drop emoji that appear only once or twice
        for key in list(biaoqing.keys()):
            if biaoqing[key] <= 2:
                del biaoqing[key]
        labels = list(biaoqing.keys())
        sizes = list(biaoqing.values())
        fig1, ax1 = plt.subplots()
        patches, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.0f%%',
                                            shadow=False, startangle=170)
        ax1.axis('equal')
        # Shrink the label font size
        plt.rcParams['font.sans-serif'] = ['SimHei']
        proptease = fm.FontProperties()
        proptease.set_size('small')
        plt.title(u"微博表情使用次数", color="black")
        plt.setp(autotexts, fontproperties=proptease)
        plt.setp(texts, fontproperties=proptease)
        plt.show()


Packaging the Python program

# install pyinstaller from the command line
pip install pyinstaller

# bundle everything into a single executable with -F
# (run the command from the directory where the script is saved)
pyinstaller -F filename.py
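If the build succeeds, PyInstaller normally writes the bundled executable to a dist subdirectory next to the script. For a GUI program like this one, the -w (windowed) flag can also be added to suppress the extra console window:

# hedged example: one-file, windowed build (the output lands in ./dist by default)
pyinstaller -F -w filename.py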


This post is only meant as a simple exercise for beginners; I am a beginner myself and, with exam revision going on, will keep improving it later, since the UI is admittedly quite ugly.

One more note: obtaining your own login cookie and the target account's id is straightforward, so it is not explained in detail here. If the program fails to run, trust me, it is almost certainly a cookie problem.
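As a quick, hedged way to check the cookie before launching the GUI: a request to weibo.cn with the cookie attached should stay on a normal page, whereas an expired cookie usually gets bounced to a login/passport page (the cookie string below is a placeholder):

import requests

cookie = {"Cookie": "paste your weibo.cn cookie string here"}   # placeholder
resp = requests.get("https://weibo.cn/", cookies=cookie)
print(resp.status_code)
print(resp.url)   # a final URL containing "login" or "passport" usually means the cookie is no longer valid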
