python 版权保护,python爬虫篇4——爬取专利著作权信息
mysql代码:
-- Software copyright registrations scraped from qichacha.com.
-- Column names are Chinese business terms (软件名称 = software name,
-- 登记号 = registration number, 分类号 = classification number, ...).
-- NOTE(review): a UNIQUE key on varchar(500) with utf8 is 1500 bytes and can
-- exceed the InnoDB 767-byte key limit on older MySQL -- confirm server version.
CREATE TABLE `copyright` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `软件名称` varchar(500) DEFAULT NULL,
  `登记号` varchar(500) DEFAULT NULL,
  `分类号` varchar(500) DEFAULT NULL,
  `软件简称` varchar(500) DEFAULT NULL,
  `版本号` varchar(500) DEFAULT NULL,
  `首次发表日期` varchar(500) DEFAULT NULL,
  `登记批准日期` varchar(500) DEFAULT NULL,
  `软件著作权人` varchar(500) DEFAULT NULL,
  `软件著作权人详情` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  -- 登记号 (registration number) is the natural key; the crawler relies on it
  -- for INSERT ... ON DUPLICATE KEY UPDATE upserts.
  UNIQUE KEY `登记号` (`登记号`)
) ENGINE=InnoDB AUTO_INCREMENT=9871 DEFAULT CHARSET=utf8;
-- Patent records scraped from CNKI (专利名称 = patent title, 发明人 = inventor,
-- 申请人 = applicant, 申请日 = filing date, 公开日 = publication date,
-- 详情地址 = detail-page URL).
-- The statement terminator was missing in the original dump; without the `;`
-- the two CREATE TABLE statements cannot be executed as one script.
CREATE TABLE `patent` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `专利名称` varchar(500) DEFAULT NULL,
  `发明人` varchar(500) DEFAULT NULL,
  `申请人` varchar(500) DEFAULT NULL,
  `申请日` datetime DEFAULT NULL,
  `公开日` datetime DEFAULT NULL,
  `详情地址` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  -- Detail-page URL is the natural key; used by ON DUPLICATE KEY upserts.
  UNIQUE KEY `详情地址` (`详情地址`)
) ENGINE=InnoDB AUTO_INCREMENT=13610 DEFAULT CHARSET=utf8;
python代码:
getpatentdata.py主程序
import re
from urllib.parse import unquote, quote
from lxml import etree
from requests_html import HTMLSession
from 抓取专利著作权信息.MysqlHelper import MysqlHelper
# Fetch patent records from CNKI (dbpub.cnki.net) and store them in MySQL.
# NOTE(review): the blog paste destroyed all indentation in this class and
# broke one string literal across lines (see self.data below); the code is
# kept byte-identical here -- reindent and rejoin before running.
class Patent:
# sqr: applicant keyword to search for; year: first application year to crawl
# (the loop below runs through 2019 inclusive).
def __init__(self, sqr, year):
# NOTE(review): port 8080 is unusual for MySQL (default is 3306) -- confirm.
self.helper = MysqlHelper(host='localhost',
port=8080,
user='root',
passwd='123',
db='students',
charset='utf8')
self.creatTable()
# Running count of rows successfully inserted across all years.
self.sum = 0
while year <= 2019:
if year >= 2016:
# 2016 onward: crawl in monthly windows (more records per year).
dateList = ["%s-01-01" % str(year), "%s-02-01" % str(year), "%s-03-01" % str(year),
"%s-04-01" % str(year), "%s-05-01" % str(year), "%s-06-01" % str(year),
"%s-07-01" % str(year), "%s-08-01" % str(year), "%s-09-01" % str(year),
"%s-10-01" % str(year), "%s-11-01" % str(year), "%s-12-01" % str(year),
"%s-12-31" % str(year)]
else:
# Earlier years: coarser two-month windows are enough.
dateList = ["%s-01-01" % str(year), "%s-03-01" % str(year), "%s-05-01" % str(year),
"%s-07-01" % str(year),
"%s-09-01" % str(year),
"%s-11-01" % str(year), "%s-12-31" % str(year)]
print("*" * 66)
print("\033[36m开始抓取%s年的专利数据,已累计抓取%s条数据\033[0m" % (str(year), self.sum))
print("*" * 66)
# Crawl each consecutive [start, end] window of the current year.
for i in range(len(dateList) - 1):
self.getPatent(sqr, dateList[i], dateList[i + 1])
else:
# for/else: runs once the windows finish (there is no break in the loop).
year += 1
else:
# while/else: all years done -- report the total and return to the menu.
print("\033[34m专利数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
showFunction()
# Create the patent table; errors (e.g. "table already exists") are returned,
# not raised, by MysqlHelper.execute and are ignored here.
def creatTable(self):
sql1 = "CREATE TABLE `patent` (`id` int primary key not null auto_increment,`专利名称` varchar(500) DEFAULT NULL ,`发明人` varchar(500) DEFAULT NULL ,`申请人` varchar(500) DEFAULT NULL,`申请日` datetime DEFAULT NULL,`公开日` datetime DEFAULT NULL,`详情地址` varchar(500) DEFAULT NULL UNIQUE);"
self.helper.execute(sql1)
# Fetch one date window of patents. sqr: applicant keyword;
# sqday_start / sqday_end: application-date range bounds (YYYY-MM-DD strings).
def getPatent(self, sqr, sqday_start, sqday_end):
self.patent_url = "http://dbpub.cnki.net/Grid2008/Dbpub/Brief.aspx?curpage=8&RecordsPerPage=350&QueryID=64&ID=SCPD&turnpage=1&systemno=&NaviDatabaseName=SCPD_ZJCLS&NaviField=%e4%b8%93%e9%a2%98%e5%ad%90%e6%a0%8f%e7%9b%ae%e4%bb%a3%e7%a0%81&navigatorValue=&subBase=all"
self.session = HTMLSession()
self.add_url = "http://dbpub.cnki.net/Grid2008/Dbpub/"
self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'http://dbpub.cnki.net',
'Host': 'dbpub.cnki.net',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
# h = self.session.post(self.patent_url, headers=headers)
# pagenum = etree.HTML(h.html.html).xpath('//div[@id="id_grid_total"]/text()')[0][5:-3]
# print("共%s条数据" % pagenum)
# self.patent_url = self.patent_url + "&RecordsPerPage=" + pagenum
# Giant pre-encoded form payload captured from the CNKI search page; the
# applicant keyword and date bounds are spliced into it below.
# NOTE(review): the paste broke this single string literal across several
# source lines -- it will not parse until the fragments are rejoined.
self.data = "ID=SCPD&hdnSearchType=&hdnIsAll=false&NaviField=%E4%B8%93%E9%A2%98%E5%AD%90%E6%A0%8F%E7%9B%AE%E4%BB%A3%E7%A0%81&NaviDatabaseName=SCPD_ZJCLS&systemno=&hdnFathorCode=sysAll&selectbox=I&strNavigatorValue=%2CA%2CB%2CC%2CD%2CE%2CF%2CH%2CI&strNavigatorName=%2C%E5%9F%BA%E7%A1%80%E7%A7%91%E5%AD%A6%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A0%E8%BE%91%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A1%E8%BE%91%2C%E5%86%9C%E4%B8%9A%E7%A7%91%E6%8A%80%2C%E5%8C%BB%E8%8D%AF%E5%8D%AB%E7%94%9F%E7%A7%91%E6%8A%80%2C%E5%93%B2%E5%AD%A6%E4%B8%8E%E4%BA%BA%E6%96%87%E7%A7%91%E5%AD%A6%2C%E7%A4%BE%E4%BC%9A%E7%A7%91%E5%AD%A6%E2%85%A1%E8%BE%91%2C%E4%BF%A1%E6%81%AF%E7%A7%91%E6%8A%80&singleleafcode=&searchAttachCondition=&SearchQueryID=5&SearchFieldRelationDirectory=&updateTempDB=&bCurYearTempDB=1&fieldtips=%E7%AF%87%E5%90%8D%2F%5B%E5%9C%A8%E6%96%87%E7%8C%AE%E6%A0%87%E9%A2%98%E4%B8%AD%E6%A3%80%E7%B4%A2%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%85%B3%E9%94%AE%E8%AF%8D%2F%5B%E6%A3%80%E7%B4%A2%E6%96%87%E7%8C%AE%E7%9A%84%E5%85%B3%E9%94%AE%E8%AF%8D%E4%B8%AD%E6%BB%A1%E8%B6%B3%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E7%AC%AC%E4%B8%80%E8%B4%A3%E4%BB%BB%E4%BA%BA%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%
81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E4%BD%9C%E8%80%85%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E4%BD%9C%E8%80%85%E5%AE%8C%E6%95%B4%E5%A7%93%E5%90%8D%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E6%9C%BA%E6%9E%84%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E6%9C%BA%E6%9E%84%E5%90%8D%E7%A7%B0%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%2F%5B%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%BC%95%E6%96%87%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%85%A8%E6%96%87%2F%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%9F%BA%E9%87%91%2F%5B%E6%A3%80%E7%B4%A2%E5%8F%97%E6%BB%A1%E8%B6%B3%E6%9D%A1%E4%BB%B6%E7%9A%84%E5%9F%BA%E9%87%91%E8%B5%84%E5%8A%A9%E7
%9A%84%E6%96%87%E7%8C%AE%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E5%88%8A%E5%90%8D%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E9%83%A8%E5%88%86%E6%88%96%E5%85%A8%E9%83%A8%E5%88%8A%E5%90%8D%E3%80%82%5D%2CISSN%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84ISSN%E5%8F%B7%E3%80%82%5D%2C%E5%B9%B4%2F%5B%E8%BE%93%E5%85%A5%E5%9B%9B%E4%BD%8D%E6%95%B0%E5%AD%97%E7%9A%84%E5%B9%B4%E4%BB%BD%E3%80%82%5D%2C%E6%9C%9F%2F%5B%E8%BE%93%E5%85%A5%E6%9C%9F%E5%88%8A%E7%9A%84%E6%9C%9F%E5%8F%B7%EF%BC%8C%E5%A6%82%E6%9E%9C%E4%B8%8D%E8%B6%B3%E4%B8%A4%E4%BD%8D%E6%95%B0%E5%AD%97%EF%BC%8C%E8%AF%B7%E5%9C%A8%E5%89%8D%E9%9D%A2%E8%A1%A5%E2%80%9C0%E2%80%9D%EF%BC%8C%E5%A6%82%E2%80%9C08%E2%80%9D%E3%80%82%5D%2C%E4%B8%BB%E9%A2%98%2F%5B%E4%B8%BB%E9%A2%98%E5%8C%85%E6%8B%AC%E7%AF%87%E5%90%8D%E3%80%81%E5%85%B3%E9%94%AE%E8%AF%8D%E3%80%81%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%E3%80%82%E5%8F%AF%E6%A3%80%E7%B4%A2%E5%87%BA%E8%BF%99%E4%B8%89%E9%A1%B9%E4%B8%AD%E4%BB%BB%E4%B8%80%E9%A1%B9%E6%88%96%E5%A4%9A%E9%A1%B9%E6%BB%A1%E8%B6%B3%E6%8C%87%E5%AE%9A%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E4%B8%BB%E9%A2%98%E6%98%AF%E6%8C%89%E8%AF%8D%E6%A3%80%E7%B4%A2%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D&advancedfield1=%E7%94%B3%E8%AF%B7%E4%BA%BA&advancedvalue1=" + quote(
sqr) + "&imageField.x=50&imageField.y=11&searchmatch=0&order=dec&RecordsPerPage=350&hdnUSPSubDB=%E4%B8%93%E5%88%A9%E7%B1%BB%E5%88%AB%2C%2B1%2B2%2B3%2B%2C3%2C3&TableType=PY&display=chinese&encode=gb&TablePrefix=SCPD&View=SCPD&yearFieldName=%E5%B9%B4&userright=&VarNum=1&MM_fieldValue_1_1=" + sqday_start + "&MM_fieldValue_1_2=" + sqday_end + "&MM_slt_updateTime=&MM_Update_Time=&MM_Update_EndTime=&MM_fieldValue_2_1=&MM_fieldValue_2_2=&MM_hiddenTxtName=MM_fieldValue_1_1%40%40%40MM_fieldValue_1_2%40%40%40MM_fieldValue_2_1%40%40%40MM_fieldValue_2_2%40%40%40MM_Update_Time%40%40%40MM_Update_EndTime&MM_fieldName=%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F&MM_hiddenRelation=%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D&lastpage=7&RecordsPerPage2=50&systemno=%2C&classtype=&QueryID=5&turnpage=&curpage=1&curpage1=1&curpage2=1"
# print(unquote(datas))
# print(h.text)
patentdata = ''
# Keep re-posting until the result table shows up (crude retry loop; will
# spin forever if CNKI never returns the table).
while len(patentdata) == 0:
self.h1 = self.session.post(self.patent_url, headers=self.headers, data=self.data)
patentdata = etree.HTML(self.h1.text).xpath('//table[@class="s_table"]//tr')
print("\033[31m从%s 到 %s 共有%s条专利数据\033[0m" % (sqday_start, sqday_end, len(patentdata) - 1))
# Write every data row into the database (row 0 is the table header).
for i in range(1, len(patentdata)):
item = patentdata[i]
# number = item.xpath('./td[@class="s_tabletd_rb"]')[0].xpath('string(.)')
# Cell order assumed from the CNKI result table: [0] row no., [1] title,
# [2] inventor, [3] applicant, [4] filing date, [5] publication date.
patentname = item.xpath('./td[@class="s_tabletd_rb"]')[1].xpath('string(.)')
patentpeople = item.xpath('./td[@class="s_tabletd_rb"]')[2].xpath('string(.)')
sqpeople = item.xpath('./td[@class="s_tabletd_rb"]')[3].xpath('string(.)')
sqday = item.xpath('./td[@class="s_tabletd_rb"]')[4].xpath('string(.)')
openday = item.xpath('./td[@class="s_tabletd_rb"]')[5].xpath('string(.)')
address = self.add_url + item.xpath('./td[@class="s_tabletd_rb"]//a/@href')[0]
# print("*" * 66)
# NOTE(review): in MySQL "SET `a` = %s and `b` = %s ..." assigns `a` the
# boolean result of the whole AND chain -- duplicates are almost certainly
# not updated as intended; the SET clauses should be comma-separated.
sql = "insert into patent(`专利名称`,`发明人`,`申请人` ,`申请日`,`公开日` ,`详情地址`) values(%s,%s,%s,%s,%s,%s)on duplicate key update `专利名称` = %s and `发明人` = %s and `申请人` = %s and `申请日` = %s and `公开日` = %s and `详情地址` = %s;"
params = [patentname, patentpeople, sqpeople, sqday, openday, address, patentname,
patentpeople, sqpeople, sqday, openday, address]
result = self.helper.execute(sql, params)
# execute() returns None on success or the exception on failure;
# presumably '1292' is MySQL "incorrect datetime value" -- those rows are
# skipped silently. TODO confirm; any OTHER error still prints "success".
if str(result).__contains__('1292'):
pass
else:
print(str(i) + '.' + patentname + ' 数据入库成功!')
self.sum += 1
# Fetch software copyright registration records from qichacha.com.
# NOTE(review): indentation was destroyed by the blog paste; lines kept
# byte-identical, comments only added -- reindent before running.
class Copyright:
# key: search keyword (copyright holder / software name) passed to qichacha.
def __init__(self, key):
# NOTE(review): port 8080 is unusual for MySQL (default is 3306) -- confirm.
self.helper = MysqlHelper(host='localhost',
port=8080,
user='root',
passwd='123',
db='students',
charset='utf8')
self.creatTable()
self.getCopyrightData(key)
# Create the copyright table; errors (e.g. "table already exists") are
# returned, not raised, by MysqlHelper.execute and are ignored here.
def creatTable(self):
sql1 = "CREATE TABLE `copyright` (`id` int primary key not null auto_increment,`软件名称` varchar(500) DEFAULT NULL ,`登记号` varchar(500) DEFAULT NULL UNIQUE,`分类号` varchar(500) DEFAULT NULL,`软件简称` varchar(500) DEFAULT NULL,`版本号` varchar(500) DEFAULT NULL,`首次发表日期` varchar(500) DEFAULT NULL,`登记批准日期` varchar(500) DEFAULT NULL,`软件著作权人` varchar(500) DEFAULT NULL,`软件著作权人详情` varchar(500) DEFAULT NULL);"
self.helper.execute(sql1)
# Crawl every result page for the keyword and upsert rows into MySQL.
def getCopyrightData(self, key):
self.add_url = 'https://www.qichacha.com'
self.page = 1
self.sum = 0
self.copyright_url = "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (key, str(self.page))
self.session = HTMLSession()
# NOTE(review): the Cookie below is a captured login session -- it will
# expire, after which every request fails; refresh it before running.
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'acw_tc=9dff1e1d15740724795763997e1d4fc677c413795a13ba5e12a187111d; QCCSESSID=4koqg095imku2ge3616s51au67; _uab_collina=157407248111977014224544; zg_did=%7B%22did%22%3A%20%2216e7e07f5ad448-0acb91ff1d41898-4c302b7a-fa000-16e7e07f5af58a%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201574072481204%2C%22updated%22%3A%201574072508771%2C%22info%22%3A%201574072481208%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%223edfa18efe756b45eb94b06651c93d3a%22%7D; UM_distinctid=16e7e07f5e0258-0c4202a4fac742-4c302b7a-fa000-16e7e07f5e6346; CNZZDATA1254842228=281213894-1574070903-%7C1574070903; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1574072482; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1574072509',
'Host': 'www.qichacha.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
try:
self.h = self.session.get(self.copyright_url, headers=self.headers)
except Exception:
# One blind retry on any request failure; a second failure propagates.
self.h = self.session.get(self.copyright_url, headers=self.headers)
# print(self.h.text)
# Total page count, read from the "last page" pagination link.
pagesum = etree.HTML(self.h.text).xpath('//a[@class="end"]/text()')[0]
while self.page <= int(pagesum):
print("*" * 66)
print('\033[31m开始抓取第%s页的数据,共%s页\033[0m' % (self.page, pagesum))
self.copyright_url = "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (key, str(self.page))
self.copyrightdata = ''
# Keep re-fetching until the result list appears (crude retry loop).
while not len(self.copyrightdata):
self.h = self.session.get(self.copyright_url, headers=self.headers)
self.copyrightdata = etree.HTML(self.h.text).xpath('//section[@id="searchlist"]')
# Parse each result entry and write it to the database.
print("*" * 66)
for item in self.copyrightdata:
# Software name (软件名称).
copyrightname = item.xpath('.//span[@class="name"]')[0].xpath('string(.)')
# Split the "登记号:... 分类号:..." line on full/half-width colons
# after stripping whitespace; [1] holds both values fused together.
djh_and_flh = re.split(r'[::]',
re.sub(r'\s+', '',
item.xpath('.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[
0].xpath(
'string(.)')))
# Registration number (登记号): text preceding the "分类号" label.
djh = re.findall(r'(.*?)分类号', djh_and_flh[1])[0]
# Classification number (分类号).
flh = djh_and_flh[2]
if not len(djh):
djh = '空'
if not len(flh):
flh = '空'
rjjc_and_bbh = re.split(r':', re.sub(r'\s+', '', item.xpath(
'.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[1].xpath('string(.)')))
# Software short name (软件简称): text preceding the "版本号" label.
rjname = re.findall(r'(.*?)版本号', rjjc_and_bbh[1])[0]
# Version number (版本号).
bbh = rjjc_and_bbh[2]
fbtime_and_pztime = re.split(r':', re.sub(r'\s+', '', item.xpath(
'.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[2].xpath('string(.)')))
# First publication date (首次发表日期).
fbtime = re.findall(r'(.*?)登记批准日期', fbtime_and_pztime[1])[0]
# Registration approval date (登记批准日期).
pztime = fbtime_and_pztime[2]
# '-' placeholders from the site become empty strings; note this runs
# AFTER the empty->'空' substitution above, so '-' never becomes '空'.
if djh == '-':
djh = ''
if flh == '-':
flh = ''
# Copyright holder (软件著作权人).
rjzzqr = re.split(r':', re.sub(r'\s+', '',
item.xpath('.//footer [@class="panel-footer clear"]')[0].xpath(
'string(.)')))[1]
# Copyright holder detail URL; some entries have no link at all.
try:
rjurl = self.add_url + item.xpath('.//footer [@class="panel-footer clear"]/a/@href')[0]
except IndexError:
rjurl = '空'
# NOTE(review): same ON DUPLICATE KEY defect as in Patent -- the "and"
# chain assigns 软件名称 a boolean, not the intended per-column updates.
sql = "insert into copyright(`软件名称`,`登记号`,`分类号` ,`软件简称`,`版本号` ,`首次发表日期`,`登记批准日期`,`软件著作权人`,`软件著作权人详情`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)on duplicate key update `软件名称` = %s and `登记号` = %s and `分类号` = %s and `软件简称` = %s and `版本号` = %s and `首次发表日期` = %s and `登记批准日期` = %s and `软件著作权人` = %s and `软件著作权人详情` = %s;"
params = [copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
rjzzqr, rjurl, copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
rjzzqr, rjurl]
result = self.helper.execute(sql, params)
# Skip rows whose error text contains '1292' (presumably MySQL's
# "incorrect value" error -- TODO confirm); anything else counts as success.
if str(result).__contains__('1292'):
pass
else:
print(copyrightname + ',' +
djh + ',' + flh + ',' + rjname + ',' + bbh + ',' + fbtime + ',' + pztime + ',' + rjzzqr + ',' + rjurl)
print('数据入库成功!')
# print("*" * 66)
# Running count of stored rows.
self.sum += 1
else:
# for/else: page finished (no break in the loop) -- advance the page.
print('\033[34m累计抓取数据%s条!\033[0m' % self.sum)
self.page += 1
else:
# while/else: all pages done -- report and return to the menu.
print("\033[34m著作权数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
showFunction()
# Show the interactive console menu and dispatch to the crawlers.
# Blocks on stdin. Option 1 crawls patents from 1985, option 2 re-crawls only
# 2019 (update), option 3 crawls copyrights, option 4 exits the process.
# An empty keyword re-prompts; the original's `point` flag (with its dead
# `point = True` reassignments) is replaced by a plain loop.
def showFunction():
    print("*" * 66)
    print("\t\t\t\t\t专利著作权信息下载工具V1.0\t\t\t\t\t")
    print("*" * 66)
    print("\033[34m请选择功能\n1.抓取全部专利数据\n2.已抓取全部专利数据,执行更新数据操作\n3.抓取全部著作权数据\n4.退出程序\033[0m")
    print("*" * 66)
    while True:
        fuc = input('请输入功能序号:')
        if not fuc.isdigit():
            print("\033[31m输入错误,请输入功能序号!\033[0m")
            continue
        choice = int(fuc)
        if choice == 1:
            # Full patent crawl, starting from 1985.
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if strs:
                Patent(strs, 1985)
                return
        elif choice == 2:
            # Update crawl: patents for the latest year only.
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if strs:
                Patent(strs, 2019)
                return
        elif choice == 3:
            strs = input("请输入著作权关键词(直接回车键返回上一级):")
            if strs:
                Copyright(strs)
                return
        elif choice == 4:
            print('程序已关闭...')
            exit()
        else:
            print("\033[31m输入错误,请输入正确的功能序号!\033[0m")
# Patent("江西")
# Patent("南昌")
# Entry point: launch the interactive menu when run as a script.
if __name__ == '__main__':
    showFunction()
MysqlHelper.py数据库辅助连接类:
from click._compat import raw_input
from pymysql import *
"""封装mysql连接类"""
class MysqlHelper:
    """Thin per-statement wrapper around pymysql.

    Every call opens a fresh connection, runs exactly one statement, commits
    and closes again -- simple but chatty (one TCP handshake per statement).

    Legacy error contract (kept for compatibility): on failure the query
    methods RETURN the exception object instead of raising; callers inspect
    ``str(result)`` for MySQL error codes (e.g. '1292').
    """

    def __init__(self, host, port, user, passwd, db, charset):
        """Store connection parameters; no connection is opened here."""
        # MySQL server address
        self.host = host
        # Server port (NOTE(review): callers in this project pass 8080;
        # MySQL's default is 3306 -- confirm the deployment)
        self.port = port
        # Account name
        self.user = user
        # Account password
        self.passwd = passwd
        # Schema (database) name
        self.db = db
        # Connection character set, e.g. 'utf8'
        self.charset = charset

    def open(self):
        """Open a new connection and cursor, stored on self."""
        self.conn = connect(host=self.host, port=self.port, user=self.user,
                            passwd=self.passwd, db=self.db, charset=self.charset)
        self.cursor = self.conn.cursor()

    def execute(self, sql, params=()):
        """Run one parameterized statement and commit.

        Returns None on success, or the Exception object on failure.
        The connection is always closed, even when the statement fails
        (the original leaked it on the error path).
        """
        try:
            self.open()
        except Exception as e:
            return e
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            return e
        finally:
            self.close()

    def createDataBase(self, sql, params=()):
        """Run one statement on a connection with no schema selected.

        Intended for CREATE DATABASE; errors are printed, not raised.
        """
        try:
            conn = connect(host=self.host, port=self.port, user=self.user,
                           passwd=self.passwd, charset=self.charset)
            try:
                cursor = conn.cursor()
                cursor.execute(sql, params)
                conn.commit()
                cursor.close()
            finally:
                conn.close()
        except Exception as e:
            print(e)

    def all(self, sql, params=()):
        """Run a query and return all rows (tuple of tuples).

        Returns the Exception object on failure (legacy contract).
        """
        try:
            self.open()
        except Exception as e:
            return e
        try:
            self.cursor.execute(sql, params)
            result = self.cursor.fetchall()
            return result
        except Exception as e:
            return e
        finally:
            self.close()

    def single(self, sql, params=()):
        """Run a query and return the first row, or None.

        On failure the exception is printed and None is returned
        (legacy contract, differs from all()).
        """
        try:
            self.open()
        except Exception as e:
            print(e)
            return None
        try:
            self.cursor.execute(sql, params)
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
        finally:
            self.close()

    def rollback(self):
        """Roll back the current connection's transaction (must be open)."""
        self.conn.rollback()

    def close(self):
        """Close the cursor and connection opened by open()."""
        self.cursor.close()
        self.conn.close()
"""测试用"""
if __name__ == '__main__':
    # Manual smoke test: insert one student and one subject, then dump rows.
    # Uses the Python 3 builtin input() instead of the original
    # `from click._compat import raw_input` -- that is a private click module
    # and in Python 3 raw_input is just an alias for input anyway.
    msh = MysqlHelper('localhost', 8080, 'root', '123', 'test', 'utf8')
    name = input('请输入学生姓名:')
    sbname = input('请输入科目名称:')
    sql = 'insert into students(name) values(%s)'
    sql1 = 'insert into subjects(sbname) values(%s)'
    sql2 = 'select id,name from students where id<5'
    msh.execute(sql, [name])
    msh.execute(sql1, [sbname])
    print(msh.all(sql2))
程序可能存在部分bug,欢迎交流指正。
python 版权保护,python爬虫篇4——爬取专利著作权信息相关推荐
- python爬虫篇4——爬取专利著作权信息
mysql代码: CREATE TABLE `copyright` (`id` int(11) NOT NULL AUTO_INCREMENT,`软件名称` varchar(500) DEFAULT ...
- [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(四) —— 应对反爬技术(选取 User-Agent、添加 IP代理池以及Cookies池 )
上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) -- 数据的持久化--使用MongoDB存储爬取的数据 最近项目有些忙,很多需求紧急上线,所以一直没能完善< 使用 ...
- [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) —— 编写一个基本的 Spider 爬取微博用户信息
上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(一) -- 新建爬虫项目 在上一篇我们新建了一个 sina_scrapy 的项目,这一节我们开始正式编写爬虫的代码. 选择目标 ...
- [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) —— 数据的持久化——使用MongoDB存储爬取的数据
上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) -- 编写一个基本的 Spider 爬取微博用户信息 在上一篇博客中,我们已经新建了一个爬虫应用,并简单实现了爬取一位微 ...
- python -又一次爬虫练习(爬取LOL所有的英雄头像)
python -又一次爬虫练习(爬取LOL所有的英雄头像) 目标网站:https://lol.qq.com/data/info-heros.shtml#Navi 一开始我尝试用requests来get ...
- Python爬虫入门(爬取豆瓣电影信息小结)
Python爬虫入门(爬取豆瓣电影信息小结) 1.爬虫概念 网络爬虫,是一种按照一定规则,自动抓取互联网信息的程序或脚本.爬虫的本质是模拟浏览器打开网页,获取网页中我们想要的那部分数据. 2.基本流程 ...
- 爬虫篇——User-Agent爬取备用及存储
爬虫篇--User-Agent爬取备用及存储 代码 代码 本文通过抓取常见的User-Agent(用户代理),将其写入列表并保存为json格式文件,且将代码进行了封装,方便以后抓取数据时动态的更新请求 ...
- layui获取input信息_python爬虫—用selenium爬取京东商品信息
python爬虫--用selenium爬取京东商品信息 1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Hea ...
- python爬虫训练:爬取榜单信息
一.创作背景 这学期的大作业是要根据这学期的学习内容做一个综合程序,这次是一个爬取酷狗音乐飙升榜单的信息,并下载下来.可以方便和我一样喜欢白嫖的人员免费下载音乐. 二.使用的库 主要使用了reques ...
- go爬虫和python爬虫哪个好_python 爬虫实战项目--爬取京东商品信息(价格、优惠、排名、好评率等)-Go语言中文社区...
利用splash爬取京东商品信息 一.环境 window7 python3.5 pycharm scrapy scrapy-splash MySQL 二.简介 为了体验scrapy-splash 的动 ...
最新文章
- exchange 2010备份及恢复
- 4 angular 重构 项目_vuejs angularjs 框架的一些比较(vue项目重构四)
- css禁止鼠标双击选中文字
- 008_效果和动画的Callback函数
- easypoi 如何合并相同的列,如何在Java中的POI中使用XWPFTable合并单元格(或应用colspan)?...
- 关于CPU Cache——程序猿需要知道的那些事
- apache OFBiz的安装
- 设置 Visual Studio 文件版权信息 - C语言零基础入门教程
- 用virt-manager管理远程KVM虚拟机
- Python内置函数sorted()高级排序用法
- 为所有北京奥运冠军名字作诗(诗集)
- html中c b和b s,Web开发中B/S架构和C/S架构的区别
- IDC发布2020上半年SD-WAN报告:阿里云领跑国内服务市场
- 扒一扒,互联网大厂内部都用什么软件沟通?
- 非常有意思的Flowlet
- chrome://flags是什么?
- 魏则西事件中,百度属于一般的龌龊
- PostgreSQL-Arcgis地理数据库中的系统表
- [转]水木不在,何以清华?
- laravel 分析html,Laravel 5:使用Blad显示HTML