python3-爬取cnnvd漏洞库

这几天要爬cnnvd漏洞库
网上找了半天，发现各种代码不是跑不了就是不好用
于是本菜鸡自己写了一个
先上效果

我是按页爬的
一页目录10条漏洞
一次爬1000页一万条
一共大概128900条
表里面的内容
不是多线程（不会），至于速度，每一万条在半小时左右
我每次开6个程序同时爬，大概一个小时爬完。

源代码就贴在这里了（虽然想要积分）
然后爬好的xls在这里，不想花时间爬的可以直接下
python代码和爬好的xls

# -*- coding:utf-8 -*-
import sys
#print (u'系统默认编码为',sys.getdefaultencoding())
default_encoding = 'utf-8'  # encoding the interpreter default should be set to (UTF-8)
if sys.getdefaultencoding() != default_encoding:
    # Legacy Python 2 workaround: `reload` is a builtin and
    # `sys.setdefaultencoding` exists only there. On Python 3 the default
    # encoding is already 'utf-8', so this branch is never entered.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
#print (u'系统默认编码为',sys.getdefaultencoding())
import requests
from  bs4 import BeautifulSoup
import traceback
import re
import xlwt

# Root of the CNNVD site; hrefs found on the list pages are relative to it.
BASE_URL = "http://www.cnnvd.org.cn"
# Paginated vulnerability index; ``{page}`` is the page number.
LIST_URL = BASE_URL + "/web/vulnerability/querylist.tag?pageno={page}&repairLd="
# Number of leading href characters dropped to obtain the CNNVD id.
# Assumes hrefs look like "/web/xxk/ldxqById.tag?CNNVD=CNNVD-201901-1014",
# where the prefix up to and including "CNNVD=" is 28 characters long.
CNNVD_ID_OFFSET = 28
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request
REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
    'Connection': 'keep-alive',
}

# Header row of the output sheet, in column order.
SHEET_HEADERS = (
    "漏洞名称", "网址", "CNNVD编号", "危害等级", "CVE编号",
    "漏洞类型", "发布时间", "威胁类型", "更新时间", "厂商",
    "漏洞简介", "漏洞公告", "参考网址", "受影响实体", "补丁",
)

# Column accumulators, one list per spreadsheet column. They live at module
# level so getURLDATA() also works when this file is imported.
list1 = []   # vulnerability name
list2 = []   # detail-page URL
list3 = []   # CNNVD id
list4 = []   # severity level
list5 = []   # CVE id
list6 = []   # vulnerability type
list7 = []   # publish date
list8 = []   # threat type
list9 = []   # update date
list10 = []  # vendor
list11 = []  # vulnerability description
list12 = []  # vulnerability bulletin
list13 = []  # reference URLs
list14 = []  # affected entities
list15 = []  # patches

# Positions inside ``detail.contents[3].contents`` that hold, in order, the
# values destined for list4 .. list10.
_DETAIL_FIELD_INDEXES = (3, 5, 7, 9, 11, 13, 15)
# Columns filled by getURLDATA(), in the order it produces values.
_DETAIL_COLUMNS = (
    list4, list5, list6, list7, list8, list9, list10,
    list11, list12, list13, list14, list15,
)
# Every column, in spreadsheet order.
_ALL_COLUMNS = (list1, list2, list3) + _DETAIL_COLUMNS


def _detail_field(detail, index):
    """Return the stripped link text of one detail-box entry, or "" if absent."""
    try:
        return detail.contents[3].contents[index].find('a').text.strip()
    except (AttributeError, IndexError, TypeError):
        # Missing box, missing entry, or an entry without an <a> child.
        return ""


def _nth(items, index):
    """Return ``items[index]``, or None when the index is out of range."""
    return items[index] if index < len(items) else None


def _joined_text(section, name, **find_kwargs):
    """Concatenate the stripped text of all matching tags inside *section*.

    Returns "" when *section* is None or contains no matching tag.
    """
    if section is None:
        return ""
    return "".join(
        node.text.strip() for node in section.find_all(name, **find_kwargs)
    )


def getURLDATA(url):
    """Fetch one vulnerability detail page and record its fields.

    The page is parsed into twelve string values (severity, CVE id, type,
    publish date, threat type, update date, vendor, description, bulletin,
    references, affected entities, patches). Fields that cannot be found
    become "". The values are appended to ``list4`` .. ``list15`` only after
    the whole page has been parsed, so a failed request never leaves those
    lists with different lengths.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        The twelve extracted values as a list, in column order.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    html = BeautifulSoup(response.content.decode(), 'html.parser')

    detail = html.find(class_='detail_xq w770')       # detail box
    intro = html.find(class_='d_ldjj')                # description section
    others = html.find_all(class_='d_ldjj m_t_20')    # remaining sections

    values = [_detail_field(detail, index) for index in _DETAIL_FIELD_INDEXES]
    values.append(_joined_text(intro, 'p'))
    values.append(_joined_text(_nth(others, 0), 'p'))  # bulletin
    values.append(_joined_text(_nth(others, 1), 'p'))  # reference URLs
    values.append(_joined_text(_nth(others, 2), 'a', attrs={'class': 'a_title2'}))  # affected entities
    values.append(_joined_text(_nth(others, 3), 'a', attrs={'class': 'a_title2'}))  # patches

    for column, value in zip(_DETAIL_COLUMNS, values):
        column.append(value)
    return values


def _scrape_page(page):
    """Scrape every vulnerability linked from one list page into the columns.

    If an entry fails, its URL and the error are printed and the rest of the
    page is skipped. Name, URL and id are recorded only after the detail
    fetch succeeded, so all columns stay aligned row by row.
    """
    print("page" + str(page))
    response = requests.get(
        LIST_URL.format(page=page), headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
    )
    html = BeautifulSoup(response.content.decode(), 'html.parser')

    for anchor in html.find_all(class_='a_title2'):
        detail_url = BASE_URL  # fallback for the error message if href is missing
        try:
            href = str(anchor.attrs['href'])
            detail_url = BASE_URL + href
            getURLDATA(detail_url)
        except Exception:
            # Per-entry boundary: report and give up on this page. Unlike a
            # bare ``except``, this lets KeyboardInterrupt stop the run.
            print(detail_url)
            traceback.print_exc()
            break
        list1.append(anchor.text.lstrip())
        list2.append(detail_url)
        list3.append(href[CNNVD_ID_OFFSET:])


def main(start=11400, last=12000):
    """Scrape list pages ``start`` .. ``last`` (inclusive) into an .xls file.

    The workbook is written to "<start>-<last>.xls" in the current directory.
    It is saved in a ``finally`` clause, so rows collected before a crash or
    Ctrl-C are still written out.

    Args:
        start: First list-page number to scrape.
        last: Last list-page number to scrape (inclusive).
    """
    book = xlwt.Workbook()
    sheet = book.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for col, title in enumerate(SHEET_HEADERS):
        sheet.write(0, col, title)

    try:
        for page in range(start, last + 1):
            _scrape_page(page)
    finally:
        # zip() stops at the shortest column, so only complete rows are written.
        for row, record in enumerate(zip(*_ALL_COLUMNS), start=1):
            for col, value in enumerate(record):
                sheet.write(row, col, value)
        book.save(str(start) + "-" + str(last) + ".xls")


if __name__ == "__main__":
    main()

python3-爬取cnnvd漏洞库相关推荐

Python2 Python3 爬取赶集网租房信息,带源码分析
*之前偶然看了某个腾讯公开课的视频,写的爬取赶集网的租房信息,这几天突然想起来,于是自己分析了一下赶集网的信息,然后自己写了一遍,写完又用Python3重写了一遍.之中也遇见了少许的坑.记一下.算是 ...
python3爬取百度图片
python3爬取百度图片最终目的:能通过输入关键字进行搜索,爬取相应的图片存储到本地或者数据库首先打开百度图片的网站,搜索任意一个关键字,比如说:水果,得到如下的界面分析: 1.百度图片搜索结 ...
Python3爬取影片入库
Python3爬取影片入库 1.服务器说明 [root@openshift maoyan]# cat /etc/redhat-release CentOS Linux release 7.4.1708 ...
Python3爬取企查查网站的企业年表并存入MySQL
Python3爬取企查查网站的企业年表并存入MySQL 本篇博客的主要内容:爬取企查查网站的企业年报数据,存到mysql中,为了方便记录,分成两个模块来写: 第一个模块是爬取数据+解析数据,并将数据存 ...
Python3爬取国家统计局官网2019年全国所有城市（2020年更新）
Python3爬取国家统计局官网2019年全国所有城市(2020年更新) 一级城市爬取一级城市爬取由于最近需要用到所有城市的数据,故从统计局爬取19年的一级城市数据 import random i ...
python3爬取巨潮资讯网站年报数据
python3爬取巨潮资讯网站年报数据 2018年年底巨潮资讯http://www.cninfo.com.cn改版了,之前实习生从网上找的脚本不能用了,因此重新修改了下爬取脚本.最初脚本的原链接忘了, ...
使用python3 爬取豆瓣电影热映和即将上映
使用python3爬取豆瓣即将上映和正在热映的电影,代码如下直接使用bs4获取页面,使用css 获取到对应的信息后,使用字符串拼接的方式,将正在热映和即将上映的信息拼接出来并写入到html页面中,在 ...
Python3 爬取豆瓣电影信息
原文链接: Python3 爬取豆瓣电影信息上一篇: python3 爬取电影信息下一篇: neo4j 查询豆瓣api https://developers.douban.com/wiki/?t ...
python3爬取数据_python3爬取巨潮资讯网站年报数据
python3爬取巨潮资讯网站年报数据 2018年年底巨潮资讯http://www.cninfo.com.cn改版了,之前实习生从网上找的脚本不能用了,因此重新修改了下爬取脚本.最初脚本的原链接忘了, ...
python3爬取巨潮资讯网的年报数据
python3爬取巨潮资讯网的年报数据前期准备: 需要用到的库: 完整代码: 前期准备: 巨潮资讯网有反爬虫机制,所以先打开巨潮资讯网的年报板块,看看有什么解决办法. 巨潮咨询年报板块可以通过这样 ...

python3-爬取cnnvd漏洞库

python3-爬取cnnvd漏洞库相关推荐

最新文章

热门文章