mysql代码:

-- Destination table for software-copyright registrations scraped from
-- qichacha.com (SHOW CREATE TABLE snapshot). `登记号` (registration number)
-- is the natural key and is UNIQUE so re-crawled rows hit the upsert path.
CREATE TABLE `copyright` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `软件名称` varchar(500) DEFAULT NULL,
  `登记号` varchar(500) DEFAULT NULL,
  `分类号` varchar(500) DEFAULT NULL,
  `软件简称` varchar(500) DEFAULT NULL,
  `版本号` varchar(500) DEFAULT NULL,
  `首次发表日期` varchar(500) DEFAULT NULL,
  `登记批准日期` varchar(500) DEFAULT NULL,
  `软件著作权人` varchar(500) DEFAULT NULL,
  `软件著作权人详情` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `登记号` (`登记号`)
) ENGINE=InnoDB AUTO_INCREMENT=9871 DEFAULT CHARSET=utf8;
-- Fix: terminating semicolon was missing, so running this file as a script
-- fused this statement with the following CREATE TABLE.

-- Destination table for patent records scraped from dbpub.cnki.net
-- (SHOW CREATE TABLE snapshot). `详情地址` (detail URL) is UNIQUE and acts
-- as the natural key for the upsert performed by the crawler.
CREATE TABLE `patent` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `专利名称` varchar(500) DEFAULT NULL,
  `发明人` varchar(500) DEFAULT NULL,
  `申请人` varchar(500) DEFAULT NULL,
  `申请日` datetime DEFAULT NULL,
  `公开日` datetime DEFAULT NULL,
  `详情地址` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `详情地址` (`详情地址`)
) ENGINE=InnoDB AUTO_INCREMENT=13610 DEFAULT CHARSET=utf8;
-- Fix: terminating semicolon was missing.

python代码:

getpatentdata.py主程序

import re

from urllib.parse import unquote, quote

from lxml import etree

from requests_html import HTMLSession

from 抓取专利著作权信息.MysqlHelper import MysqlHelper

# Crawls patent records from CNKI (dbpub.cnki.net) by applicant keyword and
# date window, and upserts them into the MySQL `patent` table.
class Patent:

    def __init__(self, sqr, year):
        """Crawl all patents for applicant keyword `sqr`, walking application
        years from `year` through 2019, then return to the main menu.

        sqr  -- applicant (申请人) keyword to search for
        year -- first application year to fetch (1985 = full crawl,
                2019 = update-only; see showFunction)
        """
        # NOTE(review): port 8080 is unusual for MySQL (default 3306) and the
        # credentials are hard-coded — confirm/externalize before reuse.
        self.helper = MysqlHelper(host='localhost',
                                  port=8080,
                                  user='root',
                                  passwd='123',
                                  db='students',
                                  charset='utf8')
        self.creatTable()
        self.sum = 0  # running count of rows reported as inserted
        while year <= 2019:
            # From 2016 onward there are far more filings, so query month by
            # month; earlier years are queried in two-month windows.
            if year >= 2016:
                dateList = ["%s-01-01" % str(year), "%s-02-01" % str(year), "%s-03-01" % str(year),
                            "%s-04-01" % str(year), "%s-05-01" % str(year), "%s-06-01" % str(year),
                            "%s-07-01" % str(year), "%s-08-01" % str(year), "%s-09-01" % str(year),
                            "%s-10-01" % str(year), "%s-11-01" % str(year), "%s-12-01" % str(year),
                            "%s-12-31" % str(year)]
            else:
                dateList = ["%s-01-01" % str(year), "%s-03-01" % str(year), "%s-05-01" % str(year),
                            "%s-07-01" % str(year),
                            "%s-09-01" % str(year),
                            "%s-11-01" % str(year), "%s-12-31" % str(year)]
            print("*" * 66)
            print("\033[36m开始抓取%s年的专利数据,已累计抓取%s条数据\033[0m" % (str(year), self.sum))
            print("*" * 66)
            # Adjacent dates form one [start, end] query window each.
            for i in range(len(dateList) - 1):
                self.getPatent(sqr, dateList[i], dateList[i + 1])
            year += 1
        print("\033[34m专利数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        showFunction()

    # Create the destination table. If it already exists, the CREATE fails and
    # the error is swallowed by MysqlHelper.execute (returns it, never raises).
    def creatTable(self):
        sql1 = "CREATE TABLE `patent` (`id` int primary key not null auto_increment,`专利名称` varchar(500) DEFAULT NULL ,`发明人` varchar(500) DEFAULT NULL ,`申请人` varchar(500) DEFAULT NULL,`申请日` datetime DEFAULT NULL,`公开日` datetime DEFAULT NULL,`详情地址` varchar(500) DEFAULT NULL UNIQUE);"
        self.helper.execute(sql1)

    # Fetch one result page and store its rows.
    # sqr: applicant keyword; sqday_start/sqday_end: application-date range.
    def getPatent(self, sqr, sqday_start, sqday_end):
        self.patent_url = "http://dbpub.cnki.net/Grid2008/Dbpub/Brief.aspx?curpage=8&RecordsPerPage=350&QueryID=64&ID=SCPD&turnpage=1&systemno=&NaviDatabaseName=SCPD_ZJCLS&NaviField=%e4%b8%93%e9%a2%98%e5%ad%90%e6%a0%8f%e7%9b%ae%e4%bb%a3%e7%a0%81&navigatorValue=&subBase=all"
        self.session = HTMLSession()
        # Detail links in the result table are relative to this base.
        self.add_url = "http://dbpub.cnki.net/Grid2008/Dbpub/"
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip,deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Connection': 'keep-alive',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Origin': 'http://dbpub.cnki.net',
                        'Host': 'dbpub.cnki.net',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
                        }
        # Pre-built, URL-encoded POST body captured from the site's advanced
        # search form; only the applicant keyword and the two dates vary.
        self.data = ("ID=SCPD&hdnSearchType=&hdnIsAll=false&NaviField=%E4%B8%93%E9%A2%98%E5%AD%90%E6%A0%8F%E7%9B%AE%E4%BB%A3%E7%A0%81&NaviDatabaseName=SCPD_ZJCLS&systemno=&hdnFathorCode=sysAll&selectbox=I&strNavigatorValue=%2CA%2CB%2CC%2CD%2CE%2CF%2CH%2CI&strNavigatorName=%2C%E5%9F%BA%E7%A1%80%E7%A7%91%E5%AD%A6%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A0%E8%BE%91%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A1%E8%BE%91%2C%E5%86%9C%E4%B8%9A%E7%A7%91%E6%8A%80%2C%E5%8C%BB%E8%8D%AF%E5%8D%AB%E7%94%9F%E7%A7%91%E6%8A%80%2C%E5%93%B2%E5%AD%A6%E4%B8%8E%E4%BA%BA%E6%96%87%E7%A7%91%E5%AD%A6%2C%E7%A4%BE%E4%BC%9A%E7%A7%91%E5%AD%A6%E2%85%A1%E8%BE%91%2C%E4%BF%A1%E6%81%AF%E7%A7%91%E6%8A%80&singleleafcode=&searchAttachCondition=&SearchQueryID=5&SearchFieldRelationDirectory=&updateTempDB=&bCurYearTempDB=1&fieldtips=%E7%AF%87%E5%90%8D%2F%5B%E5%9C%A8%E6%96%87%E7%8C%AE%E6%A0%87%E9%A2%98%E4%B8%AD%E6%A3%80%E7%B4%A2%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%85%B3%E9%94%AE%E8%AF%8D%2F%5B%E6%A3%80%E7%B4%A2%E6%96%87%E7%8C%AE%E7%9A%84%E5%85%B3%E9%94%AE%E8%AF%8D%E4%B8%AD%E6%BB%A1%E8%B6%B3%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E7%AC%AC%E4%B8%80%E8%B4%A3%E4%BB%BB%E4%BA%BA%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%"
                     "81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E4%BD%9C%E8%80%85%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E4%BD%9C%E8%80%85%E5%AE%8C%E6%95%B4%E5%A7%93%E5%90%8D%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E6%9C%BA%E6%9E%84%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E6%9C%BA%E6%9E%84%E5%90%8D%E7%A7%B0%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%2F%5B%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D%2C%E5%BC%95%E6%96%87%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%85%A8%E6%96%87%2F%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D%2C%E5%9F%BA%E9%87%91%2F%5B%E6%A3%80%E7%B4%A2%E5%8F%97%E6%BB%A1%E8%B6%B3%E6%9D%A1%E4%BB%B6%E7%9A%84%E5%9F%BA%E9%87%91%E8%B5%84%E5%8A%A9%E7"
                     "%9A%84%E6%96%87%E7%8C%AE%E3%80%82%5D%2C%E4%B8%AD%E6%96%87%E5%88%8A%E5%90%8D%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E9%83%A8%E5%88%86%E6%88%96%E5%85%A8%E9%83%A8%E5%88%8A%E5%90%8D%E3%80%82%5D%2CISSN%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84ISSN%E5%8F%B7%E3%80%82%5D%2C%E5%B9%B4%2F%5B%E8%BE%93%E5%85%A5%E5%9B%9B%E4%BD%8D%E6%95%B0%E5%AD%97%E7%9A%84%E5%B9%B4%E4%BB%BD%E3%80%82%5D%2C%E6%9C%9F%2F%5B%E8%BE%93%E5%85%A5%E6%9C%9F%E5%88%8A%E7%9A%84%E6%9C%9F%E5%8F%B7%EF%BC%8C%E5%A6%82%E6%9E%9C%E4%B8%8D%E8%B6%B3%E4%B8%A4%E4%BD%8D%E6%95%B0%E5%AD%97%EF%BC%8C%E8%AF%B7%E5%9C%A8%E5%89%8D%E9%9D%A2%E8%A1%A5%E2%80%9C0%E2%80%9D%EF%BC%8C%E5%A6%82%E2%80%9C08%E2%80%9D%E3%80%82%5D%2C%E4%B8%BB%E9%A2%98%2F%5B%E4%B8%BB%E9%A2%98%E5%8C%85%E6%8B%AC%E7%AF%87%E5%90%8D%E3%80%81%E5%85%B3%E9%94%AE%E8%AF%8D%E3%80%81%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%E3%80%82%E5%8F%AF%E6%A3%80%E7%B4%A2%E5%87%BA%E8%BF%99%E4%B8%89%E9%A1%B9%E4%B8%AD%E4%BB%BB%E4%B8%80%E9%A1%B9%E6%88%96%E5%A4%9A%E9%A1%B9%E6%BB%A1%E8%B6%B3%E6%8C%87%E5%AE%9A%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E4%B8%BB%E9%A2%98%E6%98%AF%E6%8C%89%E8%AF%8D%E6%A3%80%E7%B4%A2%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D&advancedfield1=%E7%94%B3%E8%AF%B7%E4%BA%BA&advancedvalue1=" + quote(
            sqr) + "&imageField.x=50&imageField.y=11&searchmatch=0&order=dec&RecordsPerPage=350&hdnUSPSubDB=%E4%B8%93%E5%88%A9%E7%B1%BB%E5%88%AB%2C%2B1%2B2%2B3%2B%2C3%2C3&TableType=PY&display=chinese&encode=gb&TablePrefix=SCPD&View=SCPD&yearFieldName=%E5%B9%B4&userright=&VarNum=1&MM_fieldValue_1_1=" + sqday_start + "&MM_fieldValue_1_2=" + sqday_end + "&MM_slt_updateTime=&MM_Update_Time=&MM_Update_EndTime=&MM_fieldValue_2_1=&MM_fieldValue_2_2=&MM_hiddenTxtName=MM_fieldValue_1_1%40%40%40MM_fieldValue_1_2%40%40%40MM_fieldValue_2_1%40%40%40MM_fieldValue_2_2%40%40%40MM_Update_Time%40%40%40MM_Update_EndTime&MM_fieldName=%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F&MM_hiddenRelation=%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D&lastpage=7&RecordsPerPage2=50&systemno=%2C&classtype=&QueryID=5&turnpage=&curpage=1&curpage1=1&curpage2=1")
        patentdata = ''
        # The site sometimes returns an empty page; retry until rows appear.
        # NOTE(review): this loops forever if the query legitimately has no
        # result table — consider a retry cap.
        while len(patentdata) == 0:
            self.h1 = self.session.post(self.patent_url, headers=self.headers, data=self.data)
            patentdata = etree.HTML(self.h1.text).xpath('//table[@class="s_table"]//tr')
        print("\033[31m从%s 到 %s 共有%s条专利数据\033[0m" % (sqday_start, sqday_end, len(patentdata) - 1))
        # Row 0 is the header; data columns are: [0] row number, [1] title,
        # [2] inventor, [3] applicant, [4] application date, [5] publication
        # date; the first <a> in the row links to the detail page.
        for i in range(1, len(patentdata)):
            item = patentdata[i]
            patentname = item.xpath('./td[@class="s_tabletd_rb"]')[1].xpath('string(.)')
            patentpeople = item.xpath('./td[@class="s_tabletd_rb"]')[2].xpath('string(.)')
            sqpeople = item.xpath('./td[@class="s_tabletd_rb"]')[3].xpath('string(.)')
            sqday = item.xpath('./td[@class="s_tabletd_rb"]')[4].xpath('string(.)')
            openday = item.xpath('./td[@class="s_tabletd_rb"]')[5].xpath('string(.)')
            address = self.add_url + item.xpath('./td[@class="s_tabletd_rb"]//a/@href')[0]
            # BUG FIX: the upsert previously joined assignments with AND,
            # which MySQL parses as a single boolean assignment to the first
            # column; assignments must be comma-separated.
            sql = "insert into patent(`专利名称`,`发明人`,`申请人` ,`申请日`,`公开日` ,`详情地址`) values(%s,%s,%s,%s,%s,%s) on duplicate key update `专利名称` = %s, `发明人` = %s, `申请人` = %s, `申请日` = %s, `公开日` = %s, `详情地址` = %s;"
            params = [patentname, patentpeople, sqpeople, sqday, openday, address, patentname,
                      patentpeople, sqpeople, sqday, openday, address]
            result = self.helper.execute(sql, params)
            # MysqlHelper.execute returns the exception object on failure;
            # MySQL error 1292 (invalid datetime) rows are skipped silently.
            if '1292' in str(result):
                pass
            else:
                print(str(i) + '.' + patentname + ' 数据入库成功!')
                self.sum += 1

# Crawls software-copyright registrations from qichacha.com and upserts them
# into the MySQL `copyright` table.
class Copyright:

    def __init__(self, key):
        """Crawl every result page for search keyword `key` and store it.

        key -- copyright search keyword (company / software name)
        """
        # NOTE(review): hard-coded credentials and non-default port 8080 —
        # confirm/externalize before reuse.
        self.helper = MysqlHelper(host='localhost',
                                  port=8080,
                                  user='root',
                                  passwd='123',
                                  db='students',
                                  charset='utf8')
        self.creatTable()
        self.getCopyrightData(key)

    # Create the destination table. If it already exists, the CREATE fails and
    # the error is swallowed by MysqlHelper.execute (returns it, never raises).
    def creatTable(self):
        sql1 = "CREATE TABLE `copyright` (`id` int primary key not null auto_increment,`软件名称` varchar(500) DEFAULT NULL ,`登记号` varchar(500) DEFAULT NULL UNIQUE,`分类号` varchar(500) DEFAULT NULL,`软件简称` varchar(500) DEFAULT NULL,`版本号` varchar(500) DEFAULT NULL,`首次发表日期` varchar(500) DEFAULT NULL,`登记批准日期` varchar(500) DEFAULT NULL,`软件著作权人` varchar(500) DEFAULT NULL,`软件著作权人详情` varchar(500) DEFAULT NULL);"
        self.helper.execute(sql1)

    # Page through the search results, parse each card, and upsert each row.
    def getCopyrightData(self, key):
        self.add_url = 'https://www.qichacha.com'
        self.page = 1
        self.sum = 0  # running count of rows reported as inserted
        self.copyright_url = "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (key, str(self.page))
        self.session = HTMLSession()
        # NOTE(review): the session cookie below is captured from a live
        # browser session and will expire; refresh it before running.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'acw_tc=9dff1e1d15740724795763997e1d4fc677c413795a13ba5e12a187111d; QCCSESSID=4koqg095imku2ge3616s51au67; _uab_collina=157407248111977014224544; zg_did=%7B%22did%22%3A%20%2216e7e07f5ad448-0acb91ff1d41898-4c302b7a-fa000-16e7e07f5af58a%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201574072481204%2C%22updated%22%3A%201574072508771%2C%22info%22%3A%201574072481208%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%223edfa18efe756b45eb94b06651c93d3a%22%7D; UM_distinctid=16e7e07f5e0258-0c4202a4fac742-4c302b7a-fa000-16e7e07f5e6346; CNZZDATA1254842228=281213894-1574070903-%7C1574070903; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1574072482; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1574072509',
            'Host': 'www.qichacha.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
        }
        # One immediate retry on a network failure.
        try:
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        except Exception:
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        # Total page count, read from the "last page" pagination link.
        pagesum = etree.HTML(self.h.text).xpath('//a[@class="end"]/text()')[0]
        while self.page <= int(pagesum):
            print("*" * 66)
            print('\033[31m开始抓取第%s页的数据,共%s页\033[0m' % (self.page, pagesum))
            self.copyright_url = "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (key, str(self.page))
            self.copyrightdata = ''
            # Retry until the result list section is present in the page.
            while not len(self.copyrightdata):
                self.h = self.session.get(self.copyright_url, headers=self.headers)
                self.copyrightdata = etree.HTML(self.h.text).xpath('//section[@id="searchlist"]')
            print("*" * 66)
            for item in self.copyrightdata:
                # Software name (软件名称)
                copyrightname = item.xpath('.//span[@class="name"]')[0].xpath('string(.)')
                # First <small> line: "登记号:… 分类号:…" — strip whitespace,
                # then split on full- or half-width colons.
                djh_and_flh = re.split(r'[::]',
                                       re.sub(r'\s+', '',
                                              item.xpath('.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[
                                                  0].xpath(
                                                  'string(.)')))
                # Registration number (登记号)
                djh = re.findall(r'(.*?)分类号', djh_and_flh[1])[0]
                # Classification number (分类号)
                flh = djh_and_flh[2]
                if not len(djh):
                    djh = '空'
                if not len(flh):
                    flh = '空'
                # Second <small> line: "软件简称:… 版本号:…"
                rjjc_and_bbh = re.split(r':', re.sub(r'\s+', '', item.xpath(
                    './/small[@class="text-muted clear text-ellipsis m-t-xs"]')[1].xpath('string(.)')))
                # Software short name (软件简称)
                rjname = re.findall(r'(.*?)版本号', rjjc_and_bbh[1])[0]
                # Version number (版本号)
                bbh = rjjc_and_bbh[2]
                # Third <small> line: "首次发表日期:… 登记批准日期:…"
                fbtime_and_pztime = re.split(r':', re.sub(r'\s+', '', item.xpath(
                    './/small[@class="text-muted clear text-ellipsis m-t-xs"]')[2].xpath('string(.)')))
                # First publication date (首次发表日期)
                fbtime = re.findall(r'(.*?)登记批准日期', fbtime_and_pztime[1])[0]
                # Registration approval date (登记批准日期)
                pztime = fbtime_and_pztime[2]
                if djh == '-':
                    djh = ''
                if flh == '-':
                    flh = ''
                # Copyright holder (软件著作权人), from the card footer.
                rjzzqr = re.split(r':', re.sub(r'\s+', '',
                                               item.xpath('.//footer [@class="panel-footer clear"]')[0].xpath(
                                                   'string(.)')))[1]
                # Holder detail URL (软件著作权人详情); not every card links one.
                try:
                    rjurl = self.add_url + item.xpath('.//footer [@class="panel-footer clear"]/a/@href')[0]
                except IndexError:
                    rjurl = '空'
                # BUG FIX: the upsert previously joined assignments with AND,
                # which MySQL parses as a single boolean assignment to the
                # first column; assignments must be comma-separated.
                sql = "insert into copyright(`软件名称`,`登记号`,`分类号` ,`软件简称`,`版本号` ,`首次发表日期`,`登记批准日期`,`软件著作权人`,`软件著作权人详情`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update `软件名称` = %s, `登记号` = %s, `分类号` = %s, `软件简称` = %s, `版本号` = %s, `首次发表日期` = %s, `登记批准日期` = %s, `软件著作权人` = %s, `软件著作权人详情` = %s;"
                params = [copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
                          rjzzqr, rjurl, copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
                          rjzzqr, rjurl]
                result = self.helper.execute(sql, params)
                # MysqlHelper.execute returns the exception on failure; 1292
                # (invalid datetime) rows are skipped silently.
                if '1292' in str(result):
                    pass
                else:
                    print(copyrightname + ',' +
                          djh + ',' + flh + ',' + rjname + ',' + bbh + ',' + fbtime + ',' + pztime + ',' + rjzzqr + ',' + rjurl)
                    print('数据入库成功!')
                    self.sum += 1
            print('\033[34m累计抓取数据%s条!\033[0m' % self.sum)
            self.page += 1
        print("\033[34m著作权数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        showFunction()

# Interactive text menu: dispatches to the patent / copyright crawlers.
def showFunction():
    """Print the menu, read a choice from stdin, and run the chosen crawler.

    Options: 1 = full patent crawl (from 1985), 2 = patent update (2019 only),
    3 = copyright crawl, 4 = quit. An empty keyword returns to the menu.
    """
    print("*" * 66)
    print("\t\t\t\t\t专利著作权信息下载工具V1.0\t\t\t\t\t")
    print("*" * 66)
    print("\033[34m请选择功能\n1.抓取全部专利数据\n2.已抓取全部专利数据,执行更新数据操作\n3.抓取全部著作权数据\n4.退出程序\033[0m")
    print("*" * 66)
    point = True
    while point:
        fuc = input('请输入功能序号:')
        if not fuc.isdigit():
            print("\033[31m输入错误,请输入功能序号!\033[0m")
            point = True
        elif int(fuc) == 1:
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if not len(strs):
                point = True
            else:
                # Full crawl: application years 1985 through 2019.
                Patent(strs, 1985)
                point = False
        elif int(fuc) == 2:
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if not len(strs):
                point = True
            else:
                # Update run: only the current (2019) year.
                Patent(strs, 2019)
                point = False
        elif int(fuc) == 3:
            strs = input("请输入著作权关键词(直接回车键返回上一级):")
            if not len(strs):
                point = True
            else:
                Copyright(strs)
                point = False
        elif int(fuc) == 4:
            print('程序已关闭...')
            exit()
        else:
            print("\033[31m输入错误,请输入正确的功能序号!\033[0m")
            point = True

# Patent("江西")

# Patent("南昌")

if __name__ == '__main__':
    # Entry point: show the interactive menu (crawlers call back into it).
    showFunction()

MysqlHelper.py数据库辅助连接类:

from click._compat import raw_input

from pymysql import *

"""封装mysql连接类"""

class MysqlHelper:

"""初始化数据库参数"""

def __init__(self, host, port, user, passwd, db, charset):

# 数据库连接地址

self.host = host

# 地址端口

self.port = port

# 数据库用户名

self.user = user

# 数据库密码

self.passwd = passwd

# 数据库名称

self.db = db

# 编码

self.charset = charset

"""连接数据库,获取Connection对象和cursor游标对象"""

def open(self):

self.conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd, db=self.db,

charset=self.charset)

self.cursor = self.conn.cursor()

"""执行用户输入的sql语句,参数化sql语句中的输入值"""

def execute(self, sql, params=()):

try:

# 打开数据库连接

self.open()

# 执行sql语句

self.cursor.execute(sql, params)

# 提交事务

self.conn.commit()

# 关闭数据库连接

self.close()

# print("sql执行完成")

except Exception as e:

# 发送错误回滚

# self.rollback()

return e

def createDataBase(self, sql, params=()):

try:

# 打开数据库连接

conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd,

charset=self.charset)

cursor = conn.cursor()

# 执行sql语句

cursor.execute(sql, params)

# 提交事务

conn.commit()

# 关闭数据库连接

cursor.close()

conn.close()

# print("sql执行完成")

except Exception as e:

# 发送错误回滚

# self.rollback()

print(e)

"""返回sql全部查询结果"""

def all(self, sql, params=()):

try:

# 打开数据库连接

self.open()

# 执行sql语句

self.cursor.execute(sql, params)

# 调用cursor的fetchall获取全部执行结果

result = self.cursor.fetchall()

# 关闭数据库连接

self.close()

# 返回执行结果

return result

except Exception as e:

return e

"""返回sql查询结果一行"""

def single(self, sql, params=()):

try:

# 打开数据库连接

self.open()

# 执行sql语句

self.cursor.execute(sql, params)

# 调用cursor的fetchone获取全部执行结果中的一条

result = self.cursor.fetchone()

# 关闭数据库连接

self.close()

# 返回执行结果

return result

except Exception as e:

print(e)

"""数据库回滚"""

def rollback(self):

self.conn.rollback()

"""关闭数据库"""

def close(self):

self.cursor.close()

self.conn.close()

"""测试用"""

if __name__ == '__main__':

msh = MysqlHelper('localhost', 8080, 'root', '123', 'test', 'utf8')

name = raw_input('请输入学生姓名:')

sbname = raw_input('请输入科目名称:')

sql = 'insert into students(name) values(%s)'

sql1 = 'insert into subjects(sbname) values(%s)'

sql2 = 'select id,name from students where id<5'

msh.execute(sql, [name])

msh.execute(sql1, [sbname])

print(msh.all(sql2))

程序可能存在部分bug,欢迎交流指正。

python 版权保护,python爬虫篇4——爬取专利著作权信息相关推荐

  1. python爬虫篇4——爬取专利著作权信息

    mysql代码: CREATE TABLE `copyright` (`id` int(11) NOT NULL AUTO_INCREMENT,`软件名称` varchar(500) DEFAULT ...

  2. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(四) —— 应对反爬技术(选取 User-Agent、添加 IP代理池以及Cookies池 )

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) -- 数据的持久化--使用MongoDB存储爬取的数据 最近项目有些忙,很多需求紧急上线,所以一直没能完善< 使用 ...

  3. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) —— 编写一个基本的 Spider 爬取微博用户信息

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(一) -- 新建爬虫项目 在上一篇我们新建了一个 sina_scrapy 的项目,这一节我们开始正式编写爬虫的代码. 选择目标 ...

  4. [Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(三) —— 数据的持久化——使用MongoDB存储爬取的数据

    上一篇:[Python 爬虫] 使用 Scrapy 爬取新浪微博用户信息(二) -- 编写一个基本的 Spider 爬取微博用户信息 在上一篇博客中,我们已经新建了一个爬虫应用,并简单实现了爬取一位微 ...

  5. python -又一次爬虫练习(爬取LOL所有的英雄头像)

    python -又一次爬虫练习(爬取LOL所有的英雄头像) 目标网站:https://lol.qq.com/data/info-heros.shtml#Navi 一开始我尝试用requests来get ...

  6. Python爬虫入门(爬取豆瓣电影信息小结)

    Python爬虫入门(爬取豆瓣电影信息小结) 1.爬虫概念 网络爬虫,是一种按照一定规则,自动抓取互联网信息的程序或脚本.爬虫的本质是模拟浏览器打开网页,获取网页中我们想要的那部分数据. 2.基本流程 ...

  7. 爬虫篇——User-Agent爬取备用及存储

    爬虫篇--User-Agent爬取备用及存储 代码 代码 本文通过抓取常见的User-Agent(用户代理),将其写入列表并保存为json格式文件,且将代码进行了封装,方便以后抓取数据时动态的更新请求 ...

  8. layui获取input信息_python爬虫—用selenium爬取京东商品信息

    python爬虫--用selenium爬取京东商品信息 1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Hea ...

  9. python爬虫训练:爬取榜单信息

    一.创作背景 这学期的大作业是要根据这学期的学习内容做一个综合程序,这次是一个爬取酷狗音乐飙升榜单的信息,并下载下来.可以方便和我一样喜欢白嫖的人员免费下载音乐. 二.使用的库 主要使用了reques ...

  10. go爬虫和python爬虫哪个好_python 爬虫实战项目--爬取京东商品信息(价格、优惠、排名、好评率等)-Go语言中文社区...

    利用splash爬取京东商品信息 一.环境 window7 python3.5 pycharm scrapy scrapy-splash MySQL 二.简介 为了体验scrapy-splash 的动 ...

最新文章

  1. exchange 2010备份及恢复
  2. 4 angular 重构 项目_vuejs angularjs 框架的一些比较(vue项目重构四)
  3. css禁止鼠标双击选中文字
  4. 008_效果和动画的Callback函数
  5. easypoi 如何合并相同的列,如何在Java中的POI中使用XWPFTable合并单元格(或应用colspan)?...
  6. 关于CPU Cache——程序猿需要知道的那些事
  7. apache OFBiz的安装
  8. 设置 Visual Studio 文件版权信息 - C语言零基础入门教程
  9. 用virt-manager管理远程KVM虚拟机
  10. Python内置函数sorted()高级排序用法
  11. 为所有北京奥运冠军名字作诗(诗集)
  12. html中c b和b s,Web开发中B/S架构和C/S架构的区别
  13. IDC发布2020上半年SD-WAN报告:阿里云领跑国内服务市场
  14. 扒一扒,互联网大厂内部都用什么软件沟通?
  15. 非常有意思的Flowlet
  16. chrome://flags是什么?
  17. 魏则西事件中,百度属于一般的龌龊
  18. PostgreSQL-Arcgis地理数据库中的系统表
  19. [转]水木不在,何以清华?
  20. laravel 分析html,Laravel 5:使用Blad显示HTML

热门文章

  1. 推荐英语学习几本好书
  2. 什么叫断章取义,什么叫曲解——你被骗了多少年?
  3. python 语音转文字_音频转文字这种刚需,我用python写了个软件,免费不限时
  4. CSS — 导航栏篇(一)
  5. 单片机最小系统由几部分组成?TTL是什么?
  6. PC机通过二层交换机连接三层交换机
  7. 推荐个echarts网站
  8. Intel Hex概述 以及 intel2readmemh 和 Intel HEX to BINARY File Converter Utility
  9. 什么是自锁开关?自锁开关的工作原理介绍
  10. 第二章第三章 查找和排序(上)