mysql代码:

-- Software-copyright records scraped by the Copyright crawler.
-- `登记号` (registration number) is the natural key the crawler upserts on.
-- NOTE(review): the two date columns are stored as text because the crawler
-- writes the scraped strings verbatim — confirm before tightening the types.
CREATE TABLE IF NOT EXISTS `copyright` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `软件名称` varchar(500) DEFAULT NULL,
    `登记号` varchar(500) DEFAULT NULL,
    `分类号` varchar(500) DEFAULT NULL,
    `软件简称` varchar(500) DEFAULT NULL,
    `版本号` varchar(500) DEFAULT NULL,
    `首次发表日期` varchar(500) DEFAULT NULL,
    `登记批准日期` varchar(500) DEFAULT NULL,
    `软件著作权人` varchar(500) DEFAULT NULL,
    `软件著作权人详情` varchar(500) DEFAULT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `登记号` (`登记号`)
) ENGINE=InnoDB AUTO_INCREMENT=9871 DEFAULT CHARSET=utf8;

-- Patent records scraped by the Patent crawler.
-- `详情地址` (detail-page URL) is the natural key the crawler upserts on.
CREATE TABLE IF NOT EXISTS `patent` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `专利名称` varchar(500) DEFAULT NULL,
    `发明人` varchar(500) DEFAULT NULL,
    `申请人` varchar(500) DEFAULT NULL,
    `申请日` datetime DEFAULT NULL,
    `公开日` datetime DEFAULT NULL,
    `详情地址` varchar(500) DEFAULT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `详情地址` (`详情地址`)
) ENGINE=InnoDB AUTO_INCREMENT=13610 DEFAULT CHARSET=utf8;

python代码:

getpatentdata.py主程序
import re
from urllib.parse import unquote, quote

from lxml import etree
from requests_html import HTMLSession

from 抓取专利著作权信息.MysqlHelper import MysqlHelper
class Patent:
    """Crawl patent records for one applicant from dbpub.cnki.net into MySQL.

    Instantiating the class runs the whole job: it makes sure the `patent`
    table exists, walks the requested years in date windows, stores every row
    found, and finally hands control back to the menu via ``showFunction()``.

    Attributes set on the instance:
        helper: MysqlHelper used for every statement.
        sum: number of rows stored successfully so far.
    """

    # Upper bound on POST attempts for one date window. The original code
    # retried forever whenever the result table was missing from the response.
    _MAX_FETCH_ATTEMPTS = 5

    # Upsert keyed on the UNIQUE `详情地址` column. Assignments in
    # ON DUPLICATE KEY UPDATE must be comma separated; joining them with
    # `and` assigns a single boolean expression to the first column.
    _UPSERT_SQL = (
        "insert into patent(`专利名称`,`发明人`,`申请人`,`申请日`,`公开日`,`详情地址`) "
        "values(%s,%s,%s,%s,%s,%s) "
        "on duplicate key update `专利名称` = %s, `发明人` = %s, `申请人` = %s, "
        "`申请日` = %s, `公开日` = %s;"
    )

    # Captured search-form body, split around the three values that vary per
    # request (applicant keyword, window start, window end). The text is sent
    # verbatim, so it must never be passed through %-formatting.
    _FORM_HEAD = (
        "ID=SCPD&hdnSearchType=&hdnIsAll=false&NaviField=%E4%B8%93%E9%A2%98%E5%AD%90%E6%A0%8F%E7%9B%AE%E4%BB%A3%E7%A0%81&NaviDatabaseName=SCPD_ZJCLS&systemno=&hdnFathorCode=sysAll&selectbox=I&strNavigatorValue=%2CA%2CB%2CC%2CD%2CE%2CF%2CH%2CI&strNavigatorName=%2C%E5%9F%BA%E7%A1%80%E7%A7%91%E5%AD%A6%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A0%E8%BE%91%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A1%E8%BE%91%2C%E5%86%9C%E4%B8%9A%E7%A7%91%E6%8A%80%2C%E5%8C%BB%E8%8D%AF%E5%8D%AB%E7%94%9F%E7%A7%91%E6%8A%80%2C%E5%93%B2%E5%AD%A6%E4%B8%8E%E4%BA%BA%E6%96%87%E7%A7%91%E5%AD%A6%2C%E7%A4%BE%E4%BC%9A%E7%A7%91%E5%AD%A6%E2%85%A1%E8%BE%91%2C%E4%BF%A1%E6%81%AF%E7%A7%91%E6%8A%80&singleleafcode=&searchAttachCondition=&SearchQueryID=5&SearchFieldRelationDirectory=&updateTempDB=&bCurYearTempDB=1"
        "&fieldtips=%E7%AF%87%E5%90%8D%2F%5B%E5%9C%A8%E6%96%87%E7%8C%AE%E6%A0%87%E9%A2%98%E4%B8%AD%E6%A3%80%E7%B4%A2%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%85%B3%E9%94%AE%E8%AF%8D%2F%5B%E6%A3%80%E7%B4%A2%E6%96%87%E7%8C%AE%E7%9A%84%E5%85%B3%E9%94%AE%E8%AF%8D%E4%B8%AD%E6%BB%A1%E8%B6%B3%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9"
        "B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E7%AC%AC%E4%B8%80%E8%B4%A3%E4%BB%BB%E4%BA%BA%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E4%BD%9C%E8%80%85%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E4%BD%9C%E8%80%85%E5%AE%8C%E6%95%B4%E5%A7%93%E5%90%8D%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E6%9C%BA%E6%9E%84%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E6%9C%BA%E6%9E%84%E5%90%8D%E7%A7%B0%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%2F%5B%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%BC%95%E6%96%87%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%85%A8%E6%96%87%2F%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%"
        "AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%9F%BA%E9%87%91%2F%5B%E6%A3%80%E7%B4%A2%E5%8F%97%E6%BB%A1%E8%B6%B3%E6%9D%A1%E4%BB%B6%E7%9A%84%E5%9F%BA%E9%87%91%E8%B5%84%E5%8A%A9%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E5%88%8A%E5%90%8D%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E9%83%A8%E5%88%86%E6%88%96%E5%85%A8%E9%83%A8%E5%88%8A%E5%90%8D%E3%80%82%5D%2CISSN%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84ISSN%E5%8F%B7%E3%80%82%5D%2C%E5%B9%B4%2F%5B%E8%BE%93%E5%85%A5%E5%9B%9B%E4%BD%8D%E6%95%B0%E5%AD%97%E7%9A%84%E5%B9%B4%E4%BB%BD%E3%80%82%5D%2C%E6%9C%9F%2F%5B%E8%BE%93%E5%85%A5%E6%9C%9F%E5%88%8A%E7%9A%84%E6%9C%9F%E5%8F%B7%EF%BC%8C%E5%A6%82%E6%9E%9C%E4%B8%8D%E8%B6%B3%E4%B8%A4%E4%BD%8D%E6%95%B0%E5%AD%97%EF%BC%8C%E8%AF%B7%E5%9C%A8%E5%89%8D%E9%9D%A2%E8%A1%A5%E2%80%9C0%E2%80%9D%EF%BC%8C%E5%A6%82%E2%80%9C08%E2%80%9D%E3%80%82%5D%2C%E4%B8%BB%E9%A2%98%2F%5B%E4%B8%BB%E9%A2%98%E5%8C%85%E6%8B%AC%E7%AF%87%E5%90%8D%E3%80%81%E5%85%B3%E9%94%AE%E8%AF%8D%E3%80%81%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%E3%80%82%E5%8F%AF%E6%A3%80%E7%B4%A2%E5%87%BA%E8%BF%99%E4%B8%89%E9%A1%B9%E4%B8%AD%E4%BB%BB%E4%B8%80%E9%A1%B9%E6%88%96%E5%A4%9A%E9%A1%B9%E6%BB%A1%E8%B6%B3%E6%8C%87%E5%AE%9A%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E4%B8%BB%E9%A2%98%E6%98%AF%E6%8C%89%E8%AF%8D%E6%A3%80%E7%B4%A2%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D"
        "&advancedfield1=%E7%94%B3%E8%AF%B7%E4%BA%BA&advancedvalue1="
    )
    _FORM_MIDDLE = (
        "&imageField.x=50&imageField.y=11&searchmatch=0&order=dec&RecordsPerPage=350&hdnUSPSubDB=%E4%B8%93%E5%88%A9%E7%B1%BB%E5%88%AB%2C%2B1%2B2%2B3%2B%2C3%2C3&TableType=PY&display=chinese&encode=gb&TablePrefix=SCPD&View=SCPD&yearFieldName=%E5%B9%B4&userright=&VarNum=1&MM_fieldValue_1_1="
    )
    _FORM_TAIL = (
        "&MM_slt_updateTime=&MM_Update_Time=&MM_Update_EndTime=&MM_fieldValue_2_1=&MM_fieldValue_2_2=&MM_hiddenTxtName=MM_fieldValue_1_1%40%40%40MM_fieldValue_1_2%40%40%40MM_fieldValue_2_1%40%40%40MM_fieldValue_2_2%40%40%40MM_Update_Time%40%40%40MM_Update_EndTime&MM_fieldName=%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F&MM_hiddenRelation=%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D&lastpage=7&RecordsPerPage2=50&systemno=%2C&classtype=&QueryID=5&turnpage=&curpage=1&curpage1=1&curpage2=1"
    )

    def __init__(self, sqr, year, end_year=2019):
        """Run the crawl for applicant keyword ``sqr`` from ``year`` to ``end_year``.

        Args:
            sqr: applicant keyword sent to the search form.
            year: first application year to crawl.
            end_year: last application year to crawl, inclusive. Defaults to
                2019, the value that used to be hard-coded.
        """
        # NOTE(review): connection settings are hard-coded, and port 8080 is
        # not MySQL's usual 3306 — confirm against the deployment.
        self.helper = MysqlHelper(host='localhost', port=8080, user='root', passwd='123',
                                  db='students', charset='utf8')
        self.creatTable()
        self.sum = 0
        while year <= end_year:
            boundaries = self._date_boundaries(year)
            print("*" * 66)
            print("\033[36m开始抓取%s年的专利数据,已累计抓取%s条数据\033[0m" % (str(year), self.sum))
            print("*" * 66)
            # Consecutive boundaries form the [start, end] windows of the year.
            for window_start, window_end in zip(boundaries, boundaries[1:]):
                self.getPatent(sqr, window_start, window_end)
            year += 1
        print("\033[34m专利数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        # Return to the menu. showFunction() is the caller of this constructor,
        # so each finished crawl nests one menu call deeper.
        showFunction()

    @staticmethod
    def _date_boundaries(year):
        """Return the window boundaries used to split one year of queries.

        Years from 2016 on are split monthly, earlier years every two months;
        the list always ends with December 31st.
        """
        months = range(1, 13) if year >= 2016 else range(1, 13, 2)
        return ["%s-%02d-01" % (year, month) for month in months] + ["%s-12-31" % year]

    def creatTable(self):
        """Create the `patent` table if it does not exist yet."""
        sql1 = ("CREATE TABLE IF NOT EXISTS `patent` ("
                "`id` int primary key not null auto_increment,"
                "`专利名称` varchar(500) DEFAULT NULL,"
                "`发明人` varchar(500) DEFAULT NULL,"
                "`申请人` varchar(500) DEFAULT NULL,"
                "`申请日` datetime DEFAULT NULL,"
                "`公开日` datetime DEFAULT NULL,"
                "`详情地址` varchar(500) DEFAULT NULL UNIQUE);")
        self.helper.execute(sql1)

    def getPatent(self, sqr, sqday_start, sqday_end):
        """Fetch and store the patents applied for within one date window.

        Args:
            sqr: applicant keyword.
            sqday_start: first application date of the window (YYYY-MM-DD).
            sqday_end: last application date of the window (YYYY-MM-DD).
        """
        self.patent_url = "http://dbpub.cnki.net/Grid2008/Dbpub/Brief.aspx?curpage=8&RecordsPerPage=350&QueryID=64&ID=SCPD&turnpage=1&systemno=&NaviDatabaseName=SCPD_ZJCLS&NaviField=%e4%b8%93%e9%a2%98%e5%ad%90%e6%a0%8f%e7%9b%ae%e4%bb%a3%e7%a0%81&navigatorValue=&subBase=all"
        self.add_url = "http://dbpub.cnki.net/Grid2008/Dbpub/"
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'http://dbpub.cnki.net',
            'Host': 'dbpub.cnki.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        }
        self.data = (self._FORM_HEAD + quote(sqr) + self._FORM_MIDDLE + sqday_start
                     + "&MM_fieldValue_1_2=" + sqday_end + self._FORM_TAIL)

        self.session = HTMLSession()
        patentdata = []
        try:
            for _ in range(self._MAX_FETCH_ATTEMPTS):
                self.h1 = self.session.post(self.patent_url, headers=self.headers, data=self.data)
                tree = etree.HTML(self.h1.text)
                patentdata = tree.xpath('//table[@class="s_table"]//tr') if tree is not None else []
                if patentdata:
                    break
        finally:
            # One session is opened per window; release its connections.
            self.session.close()

        if not patentdata:
            print("\033[31m从%s 到 %s 未获取到结果表格,已跳过\033[0m" % (sqday_start, sqday_end))
            return

        # The first <tr> is skipped as a header, hence the "- 1" and the range start.
        print("\033[31m从%s 到 %s 共有%s条专利数据\033[0m" % (sqday_start, sqday_end, len(patentdata) - 1))
        for i in range(1, len(patentdata)):
            item = patentdata[i]
            cells = item.xpath('./td[@class="s_tabletd_rb"]')
            links = item.xpath('./td[@class="s_tabletd_rb"]//a/@href')
            if len(cells) < 6 or not links:
                # Unexpected row layout: skip it instead of aborting the crawl.
                print("%s. 行结构异常,已跳过" % i)
                continue
            # Cell 0 is skipped (the original left its "number" read commented out).
            values = [cells[k].xpath('string(.)') for k in range(1, 6)]
            self._save_row(i, tuple(values) + (self.add_url + links[0],))

    def _save_row(self, index, row):
        """Upsert one patent row and report the outcome.

        Args:
            index: row number, used in the console message only.
            row: (name, inventors, applicant, application date, publication
                date, detail URL).
        """
        params = list(row) + list(row[:5])
        result = self.helper.execute(self._UPSERT_SQL, params)
        # MysqlHelper.execute returns None on success and the exception otherwise.
        if result is None:
            print(str(index) + '.' + row[0] + '   数据入库成功!')
            self.sum += 1
        else:
            print("%s.%s   数据入库失败: %s" % (index, row[0], result))


# Crawl software copyright information
class Copyright:
    """Crawl software-copyright records for a keyword from qichacha.com into MySQL.

    Instantiating the class runs the whole job: it makes sure the `copyright`
    table exists, walks every result page, stores each record, and finally
    hands control back to the menu via ``showFunction()``.
    """

    # Upper bound on GET attempts for one result page. The original code
    # retried forever whenever the result section was missing.
    _MAX_FETCH_ATTEMPTS = 5

    # Upsert keyed on the UNIQUE `登记号` column. Assignments in
    # ON DUPLICATE KEY UPDATE must be comma separated; joining them with
    # `and` assigns a single boolean expression to the first column.
    _UPSERT_SQL = (
        "insert into copyright(`软件名称`,`登记号`,`分类号`,`软件简称`,`版本号`,"
        "`首次发表日期`,`登记批准日期`,`软件著作权人`,`软件著作权人详情`) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s) "
        "on duplicate key update `软件名称` = %s, `分类号` = %s, `软件简称` = %s, "
        "`版本号` = %s, `首次发表日期` = %s, `登记批准日期` = %s, "
        "`软件著作权人` = %s, `软件著作权人详情` = %s;"
    )

    # Label/value separator. NOTE(review): the published source shows the
    # character class `[::]`, which looks like a full-width colon flattened to
    # ASCII, so both forms are accepted — confirm against the live page.
    _COLON = r'[:\uff1a]'

    def __init__(self, key):
        """Run the crawl for search keyword ``key``."""
        # NOTE(review): connection settings are hard-coded, and port 8080 is
        # not MySQL's usual 3306 — confirm against the deployment.
        self.helper = MysqlHelper(host='localhost', port=8080, user='root', passwd='123',
                                  db='students', charset='utf8')
        self.creatTable()
        self.getCopyrightData(key)

    def creatTable(self):
        """Create the `copyright` table if it does not exist yet."""
        sql1 = ("CREATE TABLE IF NOT EXISTS `copyright` ("
                "`id` int primary key not null auto_increment,"
                "`软件名称` varchar(500) DEFAULT NULL,"
                "`登记号` varchar(500) DEFAULT NULL UNIQUE,"
                "`分类号` varchar(500) DEFAULT NULL,"
                "`软件简称` varchar(500) DEFAULT NULL,"
                "`版本号` varchar(500) DEFAULT NULL,"
                "`首次发表日期` varchar(500) DEFAULT NULL,"
                "`登记批准日期` varchar(500) DEFAULT NULL,"
                "`软件著作权人` varchar(500) DEFAULT NULL,"
                "`软件著作权人详情` varchar(500) DEFAULT NULL);")
        self.helper.execute(sql1)

    @classmethod
    def _split_pair(cls, text, second_label):
        """Split ``'<label1>:<v1><label2>:<v2>'`` into ``(v1, v2)``.

        Whitespace is removed first; ``second_label`` is the literal text of
        the second label, which marks where the first value ends.
        """
        parts = re.split(cls._COLON, re.sub(r'\s+', '', text))
        first = re.findall(r'(.*?)' + second_label, parts[1])[0]
        return first, parts[2]

    def _page_url(self, key):
        """Build the listing URL for the current page (keyword is URL-quoted)."""
        return "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (quote(key), str(self.page))

    def _request_page(self):
        """GET ``self.copyright_url`` (one retry on error) and return the parsed tree."""
        try:
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        except Exception:
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        return etree.HTML(self.h.text)

    def _parse_item(self, item):
        """Extract the nine stored fields from one result element."""
        copyrightname = item.xpath('.//span[@class="name"]')[0].xpath('string(.)')
        smalls = item.xpath('.//small[@class="text-muted clear text-ellipsis m-t-xs"]')
        # Registration number / classification number.
        djh, flh = self._split_pair(smalls[0].xpath('string(.)'), '分类号')
        if not len(djh):
            djh = '空'
        if not len(flh):
            flh = '空'
        # Short name / version.
        rjname, bbh = self._split_pair(smalls[1].xpath('string(.)'), '版本号')
        # First publication date / approval date.
        fbtime, pztime = self._split_pair(smalls[2].xpath('string(.)'), '登记批准日期')
        if djh == '-':
            djh = ''
        if flh == '-':
            flh = ''
        footer = item.xpath('.//footer [@class="panel-footer clear"]')[0].xpath('string(.)')
        # Copyright owner.
        rjzzqr = re.split(self._COLON, re.sub(r'\s+', '', footer))[1]
        # Link to the owner's detail page, when the footer carries one.
        hrefs = item.xpath('.//footer [@class="panel-footer clear"]/a/@href')
        rjurl = self.add_url + hrefs[0] if hrefs else '空'
        return (copyrightname, djh, flh, rjname, bbh, fbtime, pztime, rjzzqr, rjurl)

    def _save_record(self, record):
        """Upsert one record (the 9-tuple from ``_parse_item``) and report the outcome."""
        params = list(record) + [record[0]] + list(record[2:])
        result = self.helper.execute(self._UPSERT_SQL, params)
        # MysqlHelper.execute returns None on success and the exception otherwise.
        if result is None:
            print(','.join(record))
            print('数据入库成功!')
            self.sum += 1
        else:
            print('%s 数据入库失败: %s' % (record[0], result))

    def getCopyrightData(self, key):
        """Walk every result page for ``key`` and store the records found."""
        self.add_url = 'https://www.qichacha.com'
        self.page = 1
        self.sum = 0
        self.copyright_url = self._page_url(key)
        self.session = HTMLSession()
        # NOTE(review): the Cookie below is a captured browser session and has
        # presumably expired — refresh it before running.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'acw_tc=9dff1e1d15740724795763997e1d4fc677c413795a13ba5e12a187111d; QCCSESSID=4koqg095imku2ge3616s51au67; _uab_collina=157407248111977014224544; zg_did=%7B%22did%22%3A%20%2216e7e07f5ad448-0acb91ff1d41898-4c302b7a-fa000-16e7e07f5af58a%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201574072481204%2C%22updated%22%3A%201574072508771%2C%22info%22%3A%201574072481208%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%223edfa18efe756b45eb94b06651c93d3a%22%7D; UM_distinctid=16e7e07f5e0258-0c4202a4fac742-4c302b7a-fa000-16e7e07f5e6346; CNZZDATA1254842228=281213894-1574070903-%7C1574070903; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1574072482; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1574072509',
            'Host': 'www.qichacha.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        }
        try:
            first = self._request_page()
            # Total page count comes from the pager's "end" link; a listing
            # without that link is treated as a single page.
            end_text = first.xpath('//a[@class="end"]/text()') if first is not None else []
            digits = re.search(r'\d+', end_text[0]) if end_text else None
            pagesum = int(digits.group()) if digits else 1

            while self.page <= pagesum:
                print("*" * 66)
                print('\033[31m开始抓取第%s页的数据,共%s页\033[0m' % (self.page, pagesum))
                self.copyright_url = self._page_url(key)
                self.copyrightdata = []
                for _ in range(self._MAX_FETCH_ATTEMPTS):
                    tree = self._request_page()
                    if tree is not None:
                        self.copyrightdata = tree.xpath('//section[@id="searchlist"]')
                    if self.copyrightdata:
                        break
                print("*" * 66)
                if not self.copyrightdata:
                    print('\033[31m第%s页未获取到数据,已跳过\033[0m' % self.page)
                for item in self.copyrightdata:
                    try:
                        record = self._parse_item(item)
                    except IndexError:
                        # Unexpected markup: skip the record instead of aborting.
                        print('记录结构异常,已跳过')
                        continue
                    self._save_record(record)
                print('\033[34m累计抓取数据%s条!\033[0m' % self.sum)
                self.page += 1
        finally:
            self.session.close()
        print("\033[34m著作权数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        # Return to the menu. showFunction() is the caller of this constructor,
        # so each finished crawl nests one menu call deeper.
        showFunction()


# Show the function menu
def showFunction():
    """Print the menu and run the action the user selects.

    Keeps prompting until a valid choice has been carried out:
      1 - crawl all patent data (from 1985),
      2 - refresh patent data (2019 only),
      3 - crawl all software-copyright data,
      4 - quit the program (raises SystemExit).
    An empty keyword at the second prompt returns to the menu prompt.
    """
    import sys  # local import: sys.exit does not depend on the site-provided exit()

    print("*" * 66)
    print("\t\t\t\t\t专利著作权信息下载工具V1.0\t\t\t\t\t")
    print("*" * 66)
    print("\033[34m请选择功能\n1.抓取全部专利数据\n2.已抓取全部专利数据,执行更新数据操作\n3.抓取全部著作权数据\n4.退出程序\033[0m")
    print("*" * 66)

    while True:
        fuc = input('请输入功能序号:')
        # str.isdigit() accepts characters such as '²' that int() rejects,
        # so the conversion is guarded as well.
        try:
            choice = int(fuc) if fuc.isdigit() else None
        except ValueError:
            choice = None
        if choice is None:
            print("\033[31m输入错误,请输入功能序号!\033[0m")
            continue

        if choice == 4:
            print('程序已关闭...')
            sys.exit()

        if choice in (1, 2):
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if not len(strs):
                continue
            # Option 1 crawls the full history, option 2 only the latest year.
            Patent(strs, 1985 if choice == 1 else 2019)
            return

        if choice == 3:
            strs = input("请输入著作权关键词(直接回车键返回上一级):")
            if not len(strs):
                continue
            Copyright(strs)
            return

        print("\033[31m输入错误,请输入正确的功能序号!\033[0m")


if __name__ == '__main__':
    showFunction()
MysqlHelper.py数据库辅助连接类:
try:
    # Private click module; fall back to the built-in when the name is absent.
    from click._compat import raw_input
except ImportError:
    raw_input = input
from pymysql import *


class MysqlHelper:
    """Small convenience wrapper around a pymysql connection.

    Every public operation opens a fresh connection, runs one statement and
    closes the connection again, whether or not the statement succeeded.
    """

    def __init__(self, host, port, user, passwd, db, charset):
        """Store the connection settings; no connection is opened here.

        Args:
            host: database server address.
            port: database server port.
            user: database user name.
            passwd: database password.
            db: database (schema) name.
            charset: connection character set.
        """
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.db = db
        self.charset = charset

    def open(self):
        """Connect to the database and create ``self.conn`` / ``self.cursor``."""
        self.conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd,
                            db=self.db, charset=self.charset)
        self.cursor = self.conn.cursor()

    def execute(self, sql, params=()):
        """Run one parameterized statement and commit it.

        Returns:
            None on success, otherwise the exception that was raised.
            Callers inspect the returned exception instead of catching it.
        """
        try:
            self.open()
        except Exception as e:
            return e
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            return e
        finally:
            # Always release the connection; it used to leak on failure.
            self.close()

    def createDataBase(self, sql, params=()):
        """Run one statement on a connection without a selected database.

        Errors are printed, not raised.
        """
        try:
            conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd,
                           charset=self.charset)
        except Exception as e:
            print(e)
            return
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql, params)
                conn.commit()
            finally:
                cursor.close()
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def all(self, sql, params=()):
        """Run a query and return every row, or the exception on failure."""
        try:
            self.open()
        except Exception as e:
            return e
        try:
            self.cursor.execute(sql, params)
            return self.cursor.fetchall()
        except Exception as e:
            return e
        finally:
            self.close()

    def single(self, sql, params=()):
        """Run a query and return its first row; on failure print the error and return None."""
        try:
            self.open()
        except Exception as e:
            print(e)
            return None
        try:
            self.cursor.execute(sql, params)
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
            return None
        finally:
            self.close()

    def rollback(self):
        """Roll back the current transaction on the open connection."""
        self.conn.rollback()

    def close(self):
        """Close the cursor and the connection."""
        self.cursor.close()
        self.conn.close()


# Manual smoke test
if __name__ == '__main__':
    # Manual smoke test: insert one student and one subject typed by the user,
    # then print the students whose id is below 5.
    msh = MysqlHelper('localhost', 8080, 'root', '123', 'test', 'utf8')
    # Built-in input() is the Python 3 equivalent of raw_input().
    name = input('请输入学生姓名:')
    sbname = input('请输入科目名称:')
    sql = 'insert into students(name) values(%s)'
    sql1 = 'insert into subjects(sbname) values(%s)'
    sql2 = 'select id,name from students where id<5'
    msh.execute(sql, [name])
    msh.execute(sql1, [sbname])
    print(msh.all(sql2))

程序可能存在部分bug,欢迎交流指正。

python爬虫篇4——爬取专利著作权信息相关推荐

  1. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(四) —— 应对反爬技术(选取 User-Agent、添加 IP代理池以及Cookies池 )

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) -- 数据的持久化--使用MongoDB存储爬取的数据 最近项目有些忙,很多需求紧急上线,所以一直没能完善< 使用 ...

  2. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) —— 编写一个基本的 Spider 爬取微博用户信息

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(一) -- 新建爬虫项目 在上一篇我们新建了一个 sina_scrapy 的项目,这一节我们开始正式编写爬虫的代码. 选择目标 ...

  3. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) —— 数据的持久化——使用MongoDB存储爬取的数据

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) -- 编写一个基本的 Spider 爬取微博用户信息 在上一篇博客中,我们已经新建了一个爬虫应用,并简单实现了爬取一位微 ...

  4. Python爬虫入门(爬取豆瓣电影信息小结)

    Python爬虫入门(爬取豆瓣电影信息小结) 1.爬虫概念 网络爬虫,是一种按照一定规则,自动抓取互联网信息的程序或脚本.爬虫的本质是模拟浏览器打开网页,获取网页中我们想要的那部分数据. 2.基本流程 ...

  5. python爬虫训练:爬取榜单信息

    一.创作背景 这学期的大作业是要根据这学期的学习内容做一个综合程序,这次是一个爬取酷狗音乐飙升榜单的信息,并下载下来.可以方便和我一样喜欢白嫖的人员免费下载音乐. 二.使用的库 主要使用了reques ...

  6. Python爬虫使用selenium爬取天猫商品信息

    文章目录 很多人学习python,不知道从何学起. 很多人学习python,掌握了基本语法过后,不知道在哪里寻找案例上手. 很多已经做案例的人,却不知道如何去学习更加高深的知识. 那么针对这三类人,我 ...

  7. python爬虫篇1——爬取中英文论文文献数据

    程序运行截图: mysql代码: CREATE TABLE `article` (`id` int(11) NOT NULL,`article_time` varchar(50) DEFAULT NU ...

  8. Python 爬虫系列:爬取全球机场信息

    前言 最近公司需要全球机场信息,用来做一些数据分析.刚好发现有个网站上有这个信息,只是没有机场的经纬度信息,不过有了机场信息,经纬度信息到时候我们自己补上去就行 网站元素分析 我们找到了有这些信息的网 ...

  9. python爬虫使用selenium爬取动态网页信息——以智联招聘网站为例

    python版本3.6 #导入两个模块 from selenium import webdriver import time from openpyxl import Workbook import ...

最新文章

  1. Java项目:精美网上音乐平台(前后端分离+java+vue+Springboot+ssm+mysql+maven+redis)
  2. 图解Win32汇编字符串和Debug输出
  3. NTU 课程笔记:MAS 714(16) 图灵机
  4. Web前端培训:有哪些好用的前端开发工具呢?
  5. UDT协议实现分析——UDT初始化和销毁
  6. 操作系统:第一章 计算机系统概述
  7. 基于Solr DIH实现MySQL表数据全量索引和增量索引
  8. 计算机组组内培训记录,计算机教研组活动记录.doc
  9. Java8新特性总结 -5.Stream API函数式操作流元素集合
  10. php 嵌套函数公式解析,Pyparsing,使用嵌套解析器解析php函数注释块的内容
  11. 布尔运算(Boolean Operations)
  12. 《塞尔达传说》系列游戏评测
  13. 在html css中加粗显示,HTML和CSS实现字体加粗的方法有哪些
  14. charles 请求出现乱码_解决Charles抓取https报文乱码问题
  15. WebApp实时开源框架Clouda入门使用与记录
  16. mysql报duplicate_mysql DUPLICATE KEY UPDATE 问题
  17. Mac上安装最流畅的Windows XP Lite(免费虚拟机VirtualBox)
  18. cf1675 F.Vlad and Unfinished Business
  19. linux分区方案探讨
  20. Performance Counter的使用

热门文章

  1. 用python画气球循环画图_Python 利用循环画散点图
  2. C陷阱与缺陷学习总结
  3. 以前画过的一些框架图拿出来当花瓶,高手勿喷啊
  4. Safari浏览器下colgroup失效导致表格列宽均分
  5. 时间提示问候语、导航栏制作
  6. Vue3 解决电脑分辨率及缩放导致页面变形的问题
  7. 视频技术系列 - 谈显示屏技术
  8. UPNP协议(超详细)--客户端之一简介
  9. 荣耀9原生android,荣耀9评测:系统体验和性能音效
  10. uni-app+vue+后台管理实现记账流水功能的移动应用