#!/usr/bin/env python3#-*- coding: utf-8 -*-#by 默不知然

importurllib.requestfrom urllib importparsefrom bs4 importBeautifulSoupimporthttp.cookiejarimportxlwtimportzlibimportreimporttimeimportxlsxwriterimportsysimportdatetimeimportpymysql'''运行方法:

python vulnerabilities_crawler 2017-10-01 2017-10-31 178

第一个为开始时间,第二个为结束时间,第三个为总页数。'''

#获得漏洞详情链接列表

defvulnerabilities_url_list(url,start_time,end_time):

header={'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19','Accept-Encoding': 'gzip, deflate','Referer': 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag'}

data={'qstartdate':'2017-10-30', #---------------》开始日期

'qenddate':'2017-10-31' #---------------》结束日期

}

data['qstartdate'] =start_time

data['qenddate'] =end_time

data= parse.urlencode(data).encode('utf-8')

vulnerabilities_url_html= urllib.request.Request(url,headers=header,data=data)

vulnerabilities_url_cookie=http.cookiejar.CookieJar()

vulnerabilities_url_opener=urllib.request.build_opener(urllib.request.HTTPCookieProcessor(vulnerabilities_url_cookie))

vulnerabilities_url_html=vulnerabilities_url_opener.open(vulnerabilities_url_html)

vulnerabilities_url_html= zlib.decompress(vulnerabilities_url_html.read(), 16+zlib.MAX_WBITS)

vulnerabilities_url_html=vulnerabilities_url_html.decode()#提取漏洞详情链接

response = r'href="(.+?)" target="_blank" class="a_title2"'vulnerabilities_link_list=re.compile(response).findall(vulnerabilities_url_html)#添加http前序

i =0for link invulnerabilities_link_list:

vulnerabilities_lists.append('http://cnnvd.org.cn'+vulnerabilities_link_list[i])

i+=1

print("已完成爬行第%d个漏洞链接"%i)

time.sleep(0.2)#漏洞信息爬取函数

defvulnerabilities_data(url):

header={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0','Accept-Encoding': 'gzip, deflate, sdch',

}

vulnerabilities_data_html= urllib.request.Request(url,headers=header)

vulnerabilities_data_cookie=http.cookiejar.CookieJar()

vulnerabilities_data_opener=urllib.request.build_opener(urllib.request.HTTPCookieProcessor(vulnerabilities_data_cookie))

vulnerabilities_data_html=vulnerabilities_data_opener.open(vulnerabilities_data_html)

vulnerabilities_data_html= zlib.decompress(vulnerabilities_data_html.read(), 16+zlib.MAX_WBITS)

vulnerabilities_data_html=vulnerabilities_data_html.decode()globalvulnerabilities_result_list

vulnerabilities_result_list=[] #抓取信息列表命名

#添加漏洞信息详情

vulnerabilities_detainled_soup1 = BeautifulSoup(vulnerabilities_data_html,'html.parser')

vulnerabilities_detainled_data= vulnerabilities_detainled_soup1.find('div',attrs={'class':'detail_xq w770'}) ##定义 漏洞信息详情 块的soup

vulnerabilities_detainled_data =vulnerabilities_detainled_data.decode()

vulnerabilities_detainled_soup= BeautifulSoup(vulnerabilities_detainled_data,'html.parser') #二次匹配

vulnerabilities_detainled_data_list= vulnerabilities_detainled_soup.find_all('li') #标签a信息汇总

try:

vulnerabilities_name= vulnerabilities_detainled_soup.h2.string #漏洞名称

except:

vulnerabilities_name= ''vulnerabilities_result_list.append(vulnerabilities_name)try:

vulnerabilities_cnnvd_num= vulnerabilities_detainled_soup.span.string #cnnvd编号

vulnerabilities_cnnvd_num = re.findall(r"\:([\s\S]*)",vulnerabilities_cnnvd_num)[0]except:

vulnerabilities_cnnvd_num= ''vulnerabilities_result_list.append(vulnerabilities_cnnvd_num)try: #漏洞等级

vulnerabilities_rank =vulnerabilities_detainled_soup.a.decode()

vulnerabilities_rank= re.search(u'([\u4e00-\u9fa5]+)',vulnerabilities_rank).group(0)except:

vulnerabilities_rank= ''vulnerabilities_result_list.append(vulnerabilities_rank)

vulnerabilities_cve_html= vulnerabilities_detainled_data_list[2].decode() #漏洞cve编号

vulnerabilities_cve_soup = BeautifulSoup(vulnerabilities_cve_html,'html.parser')try:

vulnerabilities_cve=vulnerabilities_cve_soup.a.string

vulnerabilities_cve= vulnerabilities_cve.replace("\r","").replace("\t","").replace("\n","").replace(" ","")except:

vulnerabilities_cve= ''vulnerabilities_result_list.append(vulnerabilities_cve)

vulnerabilities_type_html= vulnerabilities_detainled_data_list[3].decode() #漏洞类型

vulnerabilities_type_soup = BeautifulSoup(vulnerabilities_type_html,'html.parser')try:

vulnerabilities_type=vulnerabilities_type_soup.a.string

vulnerabilities_type= vulnerabilities_type.replace("\r","").replace("\t","").replace("\n","").replace(" ","")except:

vulnerabilities_type= ''vulnerabilities_result_list.append(vulnerabilities_type)

vulnerabilities_time_html= vulnerabilities_detainled_data_list[4].decode() #发布时间

vulnerabilities_time_soup = BeautifulSoup(vulnerabilities_time_html,'html.parser')try:

vulnerabilities_time=vulnerabilities_time_soup.a.string

vulnerabilities_time= vulnerabilities_time.replace("\r","").replace("\t","").replace("\n","")except:

vulnerabilities_time= ''vulnerabilities_result_list.append(vulnerabilities_time)

vulnerabilities_attack_html= vulnerabilities_detainled_data_list[5].decode() #威胁类型

vulnerabilities_attack_soup = BeautifulSoup(vulnerabilities_attack_html,'html.parser')try:

vulnerabilities_attack=vulnerabilities_attack_soup.a.string

vulnerabilities_attack= vulnerabilities_attack.replace("\r","").replace("\t","").replace("\n","")except:

vulnerabilities_attack= ''vulnerabilities_result_list.append(vulnerabilities_attack)

vulnerabilities_update_html= vulnerabilities_detainled_data_list[6].decode() #更新时间

vulnerabilities_update_soup = BeautifulSoup(vulnerabilities_update_html,'html.parser')try:

vulnerabilities_update=vulnerabilities_update_soup.a.string

vulnerabilities_update= vulnerabilities_update.replace("\r","").replace("\t","").replace("\n","")except:

vulnerabilities_update= ''vulnerabilities_result_list.append(vulnerabilities_update)

vulnerabilities_firm_html= vulnerabilities_detainled_data_list[7].decode() #厂商

vulnerabilities_firm_soup = BeautifulSoup(vulnerabilities_firm_html,'html.parser')try:

vulnerabilities_firm=vulnerabilities_firm_soup.a.string

vulnerabilities_firm= vulnerabilities_firm.replace("\r","").replace("\t","").replace("\n","")except:

vulnerabilities_firm= ''vulnerabilities_result_list.append(vulnerabilities_firm)

vulnerabilities_source_html= vulnerabilities_detainled_data_list[8].decode() #漏洞来源

vulnerabilities_source_soup = BeautifulSoup(vulnerabilities_source_html,'html.parser')try:

vulnerabilities_source=vulnerabilities_source_soup.a.string

vulnerabilities_source= vulnerabilities_source.replace("\r","").replace("\t","").replace("\n","")except:

vulnerabilities_source= ''vulnerabilities_result_list.append(vulnerabilities_source)#添加漏洞简介详情

vulnerabilities_title_html = vulnerabilities_detainled_soup1.find('div',attrs={'class':'d_ldjj'}) #定义 漏洞简介 块的soup

vulnerabilities_title_html =vulnerabilities_title_html.decode()

vulnerabilities_title_soup2= BeautifulSoup(vulnerabilities_title_html,'html.parser')try:

vulnerabilities_titles1= vulnerabilities_title_soup2.find_all(name='p')[0].string

vulnerabilities_titles2= vulnerabilities_title_soup2.find_all(name='p')[1].string

vulnerabilities_titles= vulnerabilities_titles1 +vulnerabilities_titles2

vulnerabilities_titles= vulnerabilities_titles.replace(' ','').replace('\t','').replace('\r','').replace('\n','')except:

vulnerabilities_titles= ''vulnerabilities_result_list.append(vulnerabilities_titles)#漏洞公告

vulnerabilities_notice_html = vulnerabilities_detainled_soup1.find('div',attrs={'class':'d_ldjj m_t_20'}) #定义 漏洞公告 块的soup

vulnerabilities_notice_html =vulnerabilities_notice_html.decode()

vulnerabilities_notice_soup2= BeautifulSoup(vulnerabilities_notice_html,'html.parser')try:

vulnerabilities_notice1= vulnerabilities_notice_soup2.find_all(name='p')[0].string

vulnerabilities_notice2= vulnerabilities_notice_soup2.find_all(name='p')[1].string

vulnerabilities_notice= vulnerabilities_notice1+vulnerabilities_notice2

vulnerabilities_notice= vulnerabilities_notice.replace('\n','').replace('\r','').replace('\t','')except:

vulnerabilities_notice= ''vulnerabilities_result_list.append(vulnerabilities_notice)#参考网址

vulnerabilities_reference_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[1] #定义 参考网址 块的soup

vulnerabilities_reference_html =vulnerabilities_reference_html.decode()

vulnerabilities_reference_soup2= BeautifulSoup(vulnerabilities_reference_html,'html.parser')try:

vulnerabilities_reference= vulnerabilities_reference_soup2.find_all(name='p')[1].string

vulnerabilities_reference= vulnerabilities_reference.replace('\n','').replace('\r','').replace('\t','').replace('链接:','')except:

vulnerabilities_reference= ''vulnerabilities_result_list.append(vulnerabilities_reference)#受影响实体

vulnerabilities_effect_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[2] #定义 受影响实体 块的soup

vulnerabilities_effect_html =vulnerabilities_effect_html.decode()

vulnerabilities_effect_soup2= BeautifulSoup(vulnerabilities_effect_html,'html.parser')try:

vulnerabilities_effect= vulnerabilities_effect_soup2.find_all(name='p')[0].string

vulnerabilities_effect= vulnerabilities_effect.replace('\n','').replace('\r','').replace('\t','').replace(' ','')except:try:

vulnerabilities_effect= vulnerabilities_effect_soup2.find_all(name='a')[0].string

vulnerabilities_effect= vulnerabilities_effect.replace('\n','').replace('\r','').replace('\t','').replace(' ','')except:

vulnerabilities_effect= ''vulnerabilities_result_list.append(vulnerabilities_effect)#补丁

vulnerabilities_patch_html = vulnerabilities_detainled_soup1.find_all('div',attrs={'class':'d_ldjj m_t_20'})[3] #定义 补丁 块的soup

vulnerabilities_patch_html =vulnerabilities_patch_html.decode()

vulnerabilities_patch_soup2= BeautifulSoup(vulnerabilities_patch_html,'html.parser')try:

vulnerabilities_patch= vulnerabilities_patch_soup2.find_all(name='p')[0].string

vulnerabilities_patch= vulnerabilities_patch.replace('\n','').replace('\r','').replace('\t','').replace(' ','')except:

vulnerabilities_patch= ''vulnerabilities_result_list.append(vulnerabilities_patch)#漏洞信息写入excel

defvulnerabilities_excel(excel):

workbook= xlsxwriter.Workbook('vulnerabilities_data.xlsx')

worksheet=workbook.add_worksheet()

row=0

col=0

worksheet.write(row,0,'漏洞名称')

worksheet.write(row,1,'CNNVD编号')

worksheet.write(row,2,'危害等级')

worksheet.write(row,3,'CVE编号')

worksheet.write(row,4,'漏洞类型')

worksheet.write(row,5,'发布时间')

worksheet.write(row,6,'攻击途径')

worksheet.write(row,7,'更新时间')

worksheet.write(row,8,'厂商')

worksheet.write(row,9,'漏洞来源')

worksheet.write(row,10,'漏洞描述')

worksheet.write(row,11,'解决方案')

worksheet.write(row,12,'参考链接')

worksheet.write(row,13,'受影响实体')

worksheet.write(row,14,'补丁')

row= 1

for i inrange(len(excel)):

worksheet.write(row,col,excel[i][0])

worksheet.write(row,col+1,excel[i][1])

worksheet.write(row,col+2,excel[i][2])

worksheet.write(row,col+3,excel[i][3])

worksheet.write(row,col+4,excel[i][4])

worksheet.write(row,col+5,excel[i][5])

worksheet.write(row,col+6,excel[i][6])

worksheet.write(row,col+7,excel[i][7])

worksheet.write(row,col+8,excel[i][8])

worksheet.write(row,col+9,excel[i][9])

worksheet.write(row,col+10,excel[i][10])

worksheet.write(row,col+11,excel[i][11])

worksheet.write(row,col+12,excel[i][12])

worksheet.write(row,col+13,excel[i][13])

worksheet.write(row,col+14,excel[i][14])

row+= 1workbook.close()#漏洞信息写入数据库

defvulnerabilities_mysql(excel):

db= pymysql.connect('127.0.0.1','root','xxxx','spider',charset='utf8')

cursor=db.cursor()for i inrange(len(excel)):

sql="INSERT INTO cnnvd(vulnerabilities_name,cnnvd_num,vulnerabilities_rank,cve_num,vulnerabilities_type,release_time,attack_path,update_time,company,vulnerabilities_source,vulnerabilities_data,solution,reference,object,path) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"

try:

cursor.execute(sql,(excel[i][0],excel[i][1],excel[i][2],excel[i][3],excel[i][4],excel[i][5],excel[i][6],excel[i][7],excel[i][8],excel[i][9],excel[i][10],excel[i][11],excel[i][12],excel[i][13],excel[i][14]))except:print('写入数据库失败')print('写入数据库完毕!!!')

db.commit()

db.close()#爬取代理ip

defmain():#调用漏洞列表函数并获得漏洞链接列表

begin =datetime.datetime.now()globalvulnerabilities_lists

vulnerabilities_lists=[]

j= 1page_count= sys.argv[3]

page_count=int(page_count)

start_time= sys.argv[1]

end_time= sys.argv[2]while j<=page_count:try:

vulnerabilities_url= 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag?pageno=%d&repairLd='%j

vulnerabilities_url_list(vulnerabilities_url,start_time,end_time)print("已完成爬行第%d页"%j)print('\n')

time.sleep(2)

j+=1

except:print('爬取失败,等待5秒后重新爬取。')

time.sleep(5)#调用漏洞信息函数并爬取漏洞信息

vulnerabilities_result_lists =[]

a=0while a

vulnerabilities_data(vulnerabilities_lists[a])

vulnerabilities_result_lists.append(vulnerabilities_result_list)

a+=1

print("完成爬行第%d个漏洞信息"%a)

time.sleep(1)except:print('爬取失败,等待5秒后重新爬取。')

time.sleep(5)#漏洞信息写入excel

vulnerabilities_excel(vulnerabilities_result_lists)#漏洞信息写入MySQL

#vulnerabilities_mysql(vulnerabilities_result_lists)

#爬行结束

end =datetime.datetime.now()

total_time= end -beginprint ('漏洞信息爬取结束')print ('应该爬行漏洞数量:',len(vulnerabilities_lists))print ('爬行时间:',total_time)if __name__ == '__main__':

main()

爬虫python漏洞群_python3-爬取cnnvd漏洞信息相关推荐

  1. python爬取豆瓣电影信息可行性分析_Python爬虫实现的根据分类爬取豆瓣电影信息功能示例...

    本文实例讲述了Python爬虫实现的根据分类爬取豆瓣电影信息功能.分享给大家供大家参考,具体如下: 代码的入口:if __name__ == '__main__': main()#! /usr/bin ...

  2. 从入门到入土:Python爬虫学习|实例练手|爬取LOL全英雄信息及技能||异步加载|初级难度反扒处理|寻找消失的API

    此博客仅用于记录个人学习进度,学识浅薄,若有错误观点欢迎评论区指出.欢迎各位前来交流.(部分材料来源网络,若有侵权,立即删除) 本人博客所有文章纯属学习之用,不涉及商业利益.不合适引用,自当删除! 若 ...

  3. python斗鱼抽奖_python3爬取斗鱼某些版块的主播人气

    python 3爬取斗鱼某些版块的主播人气 1.爬虫版块 import Test3 import urllib.request from bs4 import BeautifulSoup import ...

  4. Python爬虫实战之一 - 基于Requests爬取拉勾网招聘信息,并保存至本地csv文件

    Python爬虫实战之二 - 基于Requests抓取拉勾网招聘信息 ---------------readme--------------- 简介:本人产品汪一枚,Python自学数月,对于小白,本 ...

  5. Python搭建代理池爬取拉勾网招聘信息

    先来看一张图了解下爬虫 实现功能 多线程爬取拉勾网招聘信息 维护代理 ip 池 搭建 node 服务器 Taro 使用 echarts 做数据分析 1.多线程爬取拉勾网招聘信息 Tip:涉及知识 1. ...

  6. Python实现淘宝爬取——奶粉销售信息爬取及其数据可视化

    简介 双十一刚过,TB的销售额又创下了新高,我也为2000+亿做出了贡献 恰巧买了一袋德运奶粉,味道还不错.我就在想,接触爬虫也有两个多月了,还没有爬过TB这种经典的网站,借着劲头就爬取了一下TB上奶 ...

  7. Python实战案例:爬取中国执行信息公开网

    从面试题谈起 谈到这个项目的爬虫,就要从一道面试题谈起了. 这道面试题是: 请写一个爬虫从网址 http://zxgk.court.gov.cn/shixin/,检索被执行人姓名:"阿里&q ...

  8. 爬虫基本库request使用—爬取猫眼电影信息

    使用request库和正则表达式爬取猫眼电影信息. 1.爬取目标 猫眼电影TOP100的电影名称,时间,评分,等信息,将结果以文件存储. 2.准备工作 安装request库. 3.代码实现 1 imp ...

  9. python爬取天气_python3爬取各类天气信息

    本来是想从网上找找有没有现成的爬取空气质量状况和天气情况的爬虫程序,结果找了一会儿感觉还是自己写一个吧. 主要是爬取北京包括北京周边省会城市的空气质量数据和天气数据. 过程中出现了一个错误:Unico ...

  10. 爬虫实战:Requests+BeautifulSoup 爬取京东内衣信息并导入表格(python)

    准备工作 假如我们想把京东内衣类商品的信息全部保存到本地,通过手工复制粘贴将是一项非常庞大的工程,此时,可以用python爬虫实现. 第一步:分析网页地址 起始网页地址 起始网页地址 https:// ...

最新文章

  1. 一个量子物理学家是怎样研究 AI 的? | 8月书讯
  2. iphone导出照片到电脑_如何更改 iPhone 照片格式?
  3. SQL Server 为什么事务日志自动增长会降低你的性能
  4. 在一台电脑上运行两个或多个tomcat
  5. python中定义数据结构_Python中的数据结构—简介
  6. KVM--安装及初步使用
  7. Nginx 配置 SSL 及 HTTPS 协议通信过程
  8. html小游戏代码_研发实践:Mozilla分享如何开发一款WebVR小游戏
  9. [Java] 蓝桥杯BASIC-15 基础练习 字符串对比
  10. 的clear会清空内存吗_Python内存分配时有哪些不为你知的小秘密?
  11. SQL动态配置,动态解析SQL
  12. 【公基】口算提速,java 自动生成:加、减、乘、除、混合运算题目
  13. 软件渗透测试:定义、需求、过程
  14. cydia多开微信_cydia怎么安装微信
  15. 数据结构-图的应用-最小生成树(类C语言版)
  16. 家用洗地机实用吗?家用洗地机款式推荐
  17. Kali 开机报错解决方案
  18. 徐无忌MySQL笔记:MySQL数据库锁有几种?实现原理是什么?
  19. 如何处理u盘一插进电脑就自动打开里面的所有文件夹
  20. 捷联惯导速度更新_划桨效应补偿算法推导(双子样)

热门文章

  1. 利用VBA操作OutLook批量发送工资条
  2. 工资条EXCEL短信怎么发送?
  3. 数学建模:人口增长模型
  4. 主动轮廓模型——Snake分割算法 matlab源码
  5. PHP100的php教程批量打包下载
  6. 数字图像处理期末复习题
  7. 【转载】大学生必备软件免费自学视频教程,让那些培训班都见鬼去吧
  8. 调度指挥系统解决方案
  9. BeanUtils简介
  10. 达摩院的地球云计算平台AI Earth使用体验