Code updated 2018-01-26 16:04:06.

Excel output:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然
# 2017-12-18 10:01:30

import requests
from urllib import parse
from bs4 import BeautifulSoup
import xlwt
import zlib
import re
import time
import xlsxwriter
import sys
import datetime
import random
import threadpool

'''
Usage:
    python spider_cnnvd.py 2017-10-01 2017-10-31 178
The first argument is the start date, the second the end date,
and the third the total number of list pages to crawl.
'''

# Shared state used by the functions below; every vulnerability contributes
# exactly 15 fields to vulnerabilities_result_list.
agent_lists = []                    # proxy pool
vulnerabilities_lists = []          # detail-page URLs
vulnerabilities_result_list = []    # flat list of scraped fields
# Fetch proxies and append them to agent_lists
def agent_list(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = requests.get(url, headers=header)
    agent_info = BeautifulSoup(r.content, 'lxml').find(id="ip_list").find_all('tr')[1:]
    for row in agent_info:
        info = row.find_all('td')
        agents = {info[5].string: 'http://' + info[1].string}
        agent_lists.append(agents)
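# Note: requests expects proxy mappings shaped like {'http': 'http://host:port'};
# agent_list keys each entry on the protocol column of the xicidaili table and
# uses only the IP column, so it assumes that table layout (and a proxy that
# listens on the default port).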
# Collect the detail-page link of every vulnerability in the date range
def vulnerabilities_url_list(url, start_time, end_time):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = {'qstartdate': start_time, 'qenddate': end_time}
    proxy = random.sample(agent_lists, 1)[0]
    vulnerabilities_url_html = requests.post(url, headers=header, proxies=proxy, data=data)
    vulnerabilities_url_html = vulnerabilities_url_html.content.decode()
    # Extract the detail links
    response = r'href="(.+?)" target="_blank" class="a_title2"'
    vulnerabilities_link_list = re.compile(response).findall(vulnerabilities_url_html)
    # Prepend the scheme and host
    i = 0
    for link in vulnerabilities_link_list:
        vulnerabilities_lists.append('http://cnnvd.org.cn' + link)
        i += 1
        print("Crawled vulnerability link #%d" % i)
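# Note: the a_title2 class in the link regex above matches the 2017-era markup
# of the cnnvd.org.cn search results; if the list page changes, the pattern
# must be updated.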
# Scrape the fields of one vulnerability detail page
def vulnerabilities_data(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
    #proxy = random.sample(agent_lists, 1)[0]
    vulnerabilities_data_html = requests.get(url, headers=header)  # , proxies=proxy)
    vulnerabilities_data_html = vulnerabilities_data_html.content.decode()
    vulnerabilities_result_list_eve = []    # fields scraped from this page

    # "Vulnerability details" block
    vulnerabilities_detainled_soup1 = BeautifulSoup(vulnerabilities_data_html, 'html.parser')
    vulnerabilities_detainled_data = vulnerabilities_detainled_soup1.find('div', attrs={'class': 'detail_xq w770'})
    vulnerabilities_detainled_data = vulnerabilities_detainled_data.encode().decode()
    vulnerabilities_detainled_soup = BeautifulSoup(vulnerabilities_detainled_data, 'html.parser')    # second parse pass
    vulnerabilities_detainled_data_list = vulnerabilities_detainled_soup.find_all('li')    # the <li> field rows

    try:
        vulnerabilities_name = vulnerabilities_detainled_soup.h2.string    # vulnerability name
    except Exception:
        vulnerabilities_name = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_name)

    try:
        vulnerabilities_cnnvd_num = vulnerabilities_detainled_soup.span.string    # CNNVD ID
        vulnerabilities_cnnvd_num = re.findall(r"\:([\s\S]*)", vulnerabilities_cnnvd_num)[0]
    except Exception:
        vulnerabilities_cnnvd_num = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_cnnvd_num)

    try:    # severity
        vulnerabilities_rank = vulnerabilities_detainled_soup.a.decode()
        vulnerabilities_rank = re.search(u'([\u4e00-\u9fa5]+)', vulnerabilities_rank).group(0)
    except Exception:
        vulnerabilities_rank = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_rank)

    # The remaining <li> rows all follow the same pattern (the original script
    # repeated this block once per field): index into the row list, take the
    # first <a> tag's text, strip whitespace.
    li_fields = [
        (2, True),   # CVE ID (spaces stripped too)
        (3, True),   # vulnerability type (spaces stripped too)
        (4, False),  # publish time
        (5, False),  # threat type
        (6, False),  # update time
        (7, False),  # vendor
        (8, False),  # source
    ]
    for index, strip_spaces in li_fields:
        field_html = vulnerabilities_detainled_data_list[index].encode().decode()
        field_soup = BeautifulSoup(field_html, 'html.parser')
        try:
            value = field_soup.a.string
            value = value.replace("\r", "").replace("\t", "").replace("\n", "")
            if strip_spaces:
                value = value.replace(" ", "")
        except Exception:
            value = ''
        vulnerabilities_result_list_eve.append(value)

    # Description ("漏洞简介" block)
    vulnerabilities_title_html = vulnerabilities_detainled_soup1.find('div', attrs={'class': 'd_ldjj'})
    vulnerabilities_title_html = vulnerabilities_title_html.encode().decode()
    vulnerabilities_title_soup2 = BeautifulSoup(vulnerabilities_title_html, 'html.parser')
    try:
        vulnerabilities_titles = (vulnerabilities_title_soup2.find_all(name='p')[0].string
                                  + vulnerabilities_title_soup2.find_all(name='p')[1].string)
        vulnerabilities_titles = vulnerabilities_titles.replace(' ', '').replace('\t', '').replace('\r', '').replace('\n', '')
    except Exception:
        vulnerabilities_titles = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_titles)

    # Advisory / solution ("漏洞公告" block)
    vulnerabilities_notice_html = vulnerabilities_detainled_soup1.find('div', attrs={'class': 'd_ldjj m_t_20'})
    vulnerabilities_notice_html = vulnerabilities_notice_html.encode().decode()
    vulnerabilities_notice_soup2 = BeautifulSoup(vulnerabilities_notice_html, 'html.parser')
    try:
        vulnerabilities_notice = (vulnerabilities_notice_soup2.find_all(name='p')[0].string
                                  + vulnerabilities_notice_soup2.find_all(name='p')[1].string)
        vulnerabilities_notice = vulnerabilities_notice.replace('\n', '').replace('\r', '').replace('\t', '')
    except Exception:
        vulnerabilities_notice = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_notice)

    # Reference links
    vulnerabilities_reference_html = vulnerabilities_detainled_soup1.find_all('div', attrs={'class': 'd_ldjj m_t_20'})[1]
    vulnerabilities_reference_html = vulnerabilities_reference_html.encode().decode()
    vulnerabilities_reference_soup2 = BeautifulSoup(vulnerabilities_reference_html, 'html.parser')
    try:
        vulnerabilities_reference = vulnerabilities_reference_soup2.find_all(name='p')[1].string
        vulnerabilities_reference = vulnerabilities_reference.replace('\n', '').replace('\r', '').replace('\t', '').replace('链接:', '')
    except Exception:
        vulnerabilities_reference = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_reference)

    # Affected entities
    vulnerabilities_effect_html = vulnerabilities_detainled_soup1.find_all('div', attrs={'class': 'd_ldjj m_t_20'})[2]
    vulnerabilities_effect_html = vulnerabilities_effect_html.encode().decode()
    vulnerabilities_effect_soup2 = BeautifulSoup(vulnerabilities_effect_html, 'html.parser')
    try:
        vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='p')[0].string
        vulnerabilities_effect = vulnerabilities_effect.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
    except Exception:
        try:
            vulnerabilities_effect = vulnerabilities_effect_soup2.find_all(name='a')[0].string
            vulnerabilities_effect = vulnerabilities_effect.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
        except Exception:
            vulnerabilities_effect = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_effect)

    # Patch
    vulnerabilities_patch_html = vulnerabilities_detainled_soup1.find_all('div', attrs={'class': 'd_ldjj m_t_20'})[3]
    vulnerabilities_patch_html = vulnerabilities_patch_html.encode().decode()
    vulnerabilities_patch_soup2 = BeautifulSoup(vulnerabilities_patch_html, 'html.parser')
    try:
        vulnerabilities_patch = vulnerabilities_patch_soup2.find_all(name='p')[0].string
        vulnerabilities_patch = vulnerabilities_patch.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
    except Exception:
        vulnerabilities_patch = ''
    vulnerabilities_result_list_eve.append(vulnerabilities_patch)

    # Append this page's 15 fields to the shared flat result list
    for field in vulnerabilities_result_list_eve:
        vulnerabilities_result_list.append(field)
    print(re.findall(r'CNNVD-[\s+\S+]+', url)[0])
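# Caveat: the numeric indices into find_all('li') and find_all('div', ...)
# above assume the CNNVD detail page keeps its 2017 field order; if the
# markup changes, every scraped column shifts by the same offset.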
# Write the scraped fields to an Excel workbook
def vulnerabilities_excel(excel):
    workbook = xlsxwriter.Workbook('spider_cnnvd.xlsx')
    worksheet = workbook.add_worksheet()
    headers = ['Vulnerability Name', 'CNNVD ID', 'Severity', 'CVE ID',
               'Vulnerability Type', 'Publish Time', 'Attack Vector',
               'Update Time', 'Vendor', 'Source', 'Description',
               'Solution', 'Reference Links', 'Affected Entities', 'Patch']
    for col, title in enumerate(headers):
        worksheet.write(0, col, title)
    # The flat result list holds 15 fields per vulnerability
    row = 1
    n = 0
    while n < len(excel):
        for col in range(15):
            worksheet.write(row, col, excel[n + col])
        row += 1
        n += 15
    workbook.close()

def main():
    # Build the proxy pool
    for i in range(1, 2):
        url = 'http://www.xicidaili.com/nn/' + str(i)
        agent_list(url)

    # Crawl the paginated list and collect detail links
    begin = datetime.datetime.now()
    start_time = sys.argv[1]
    end_time = sys.argv[2]
    page_count = int(sys.argv[3])
    j = 1
    while j <= page_count:
        try:
            holes_url = 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag?pageno=%d&repairLd=' % j
            vulnerabilities_url_list(holes_url, start_time, end_time)
            print("Crawled list page %d" % j)
            print('\n')
            j += 1
        except Exception:
            print('Crawl failed; retrying with another proxy.')
            time.sleep(10)

    # Scrape every detail page with a 5-thread pool
    pool = threadpool.ThreadPool(5)
    reqs = threadpool.makeRequests(vulnerabilities_data, vulnerabilities_lists)
    [pool.putRequest(req) for req in reqs]
    pool.wait()

    # Write the results to Excel
    vulnerabilities_excel(vulnerabilities_result_list)

    # Done crawling
    end = datetime.datetime.now()
    total_time = end - begin
    print('Finished crawling vulnerability data')
    print('Vulnerabilities crawled: ', len(vulnerabilities_lists))
    print('Elapsed time: ', total_time)

if __name__ == '__main__':
    main()
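The third-party threadpool package used above has been unmaintained for years. A minimal sketch of the same five-worker fan-out using only the standard library, assuming the vulnerabilities_data and vulnerabilities_lists defined above:

from concurrent.futures import ThreadPoolExecutor

def crawl_details(urls, workers=5):
    # Equivalent of pool.putRequest(...) / pool.wait(): run vulnerabilities_data
    # over every detail URL on a pool of `workers` threads.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(vulnerabilities_data, urls))

# Usage: crawl_details(vulnerabilities_lists)

Wrapping pool.map in list() forces the result iterator to be consumed, so any exception raised inside a worker propagates to the caller instead of being lost.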

Reposted from: https://www.cnblogs.com/kaiho/p/8056317.html
