使用Python爬取知网信息

import requests
from urllib import request
from lxml import etree
import re
import csv, time
from w3lib.html import remove_tags


def write_data(name):
    """Make sure the CSV output file exists and begins with the header row.

    The header is written only when the file is missing or empty, so
    re-running the crawler against an existing file does not insert a second
    header row in the middle of previously collected data.

    Args:
        name: Path of the CSV file that article rows will be appended to.

    Returns:
        The same path, for passing on to ``get_two``.
    """
    import os  # local import: the file's top-level import block does not provide it

    file = name
    if not os.path.exists(file) or os.path.getsize(file) == 0:
        # utf-8-sig puts a BOM at the start of a fresh file; newline='' lets
        # the csv module control line endings itself.
        with open(file, 'a+', encoding='utf-8-sig', newline='') as f:
            writer_f = csv.writer(f)
            writer_f.writerow(['题目', '作者', '单位', '所属期刊', '发表时间',
                               '下载数', '摘要', '关键词', '专题', '分类号'])
    return file
# Endpoint that returns one page of the search-result grid as HTML.
_SEARCH_URL = 'https://kns.cnki.net/KNS8/Brief/GetGridTableHtml'

# Headers sent with every search request. No Cookie header is sent.
_SEARCH_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.70',
    'Referer': 'https://kns.cnki.net/KNS8/AdvSearch?dbcode=CJFQ',
}

# Seconds to wait on any single HTTP request, so that a stalled connection
# cannot hang the whole crawl. The value itself is an arbitrary upper bound.
_REQUEST_TIMEOUT = 30

# Pager fragment of the result grid; captures the text between "共" and "页".
_TOTAL_PAGES_RE = re.compile('<div class=\'pages\'> <span class="total">共(.*?)页</span>.*?</div>', re.S)

# One result row: dbcode, dbname, filename, journal cell, date cell, download cell.
_RESULT_ROW_RE = re.compile('<tr.*?<td class="name">.*?<a class="fz14" href=.*?&DbCode=(.*?)&dbname=(.*?)&filename=(.*?)&urlid.*?>.*?</a>.*?</td>.*?<td class="source">(.*?)</td>.*?<td class="date">(.*?)</td>.*?<td class="download">(.*?)</td>.*?</tr>', re.S)

# Detail-page header: title (<h1>) followed by the first two <h3> elements.
_DETAIL_HEADER_RE = re.compile('<div class="brief">.*?<div class="wx-tit">.*?<h1>(.*?)</h1>.*?<h3.*?>(.*?)</h3>.*?<h3.*?>(.*?)</h3>.*?</div>.*?</div>', re.S)

# Detail-page abstract text.
_ABSTRACT_RE = re.compile('<div class="row"><span class="rowtit">.*?class="abstract-text">(.*?)</span>.*?</script>', re.S)

# Characters deleted from the author string: the digits 1-8 and ASCII commas.
_AUTHOR_STRIP = str.maketrans('', '', '12345678,')


def _query_json(year):
    """Build the QueryJson form value restricting the search to *year*."""
    # The payload is kept as literal text (not json.dumps) so that the bytes
    # sent to the server stay exactly as they were.
    # NOTE(review): the original carried a stray note "CO33_1,_2" next to this
    # payload - presumably alternative values for the "C033_8?" code; confirm.
    return (
        '{"Platform":"","DBCode":"CJFQ","KuaKuCode":"","QNode":{"QGroup":['
        '{"Key":"Subject","Title":"","Logic":4,"Items":[],"ChildItems":[]},'
        '{"Key":"ControlGroup","Title":"","Logic":1,"Items":[],"ChildItems":['
        '{"Key":".extend-tit-checklist","Title":"","Logic":1,"Items":['
        '{"Key":0,"Title":"SCI","Logic":2,"Name":"SI","Operate":"=","Value":"Y","ExtendType":14,"ExtendValue":"","Value2":"","BlurType":""},'
        '{"Key":0,"Title":"EI","Logic":2,"Name":"EI","Operate":"=","Value":"Y","ExtendType":14,"ExtendValue":"","Value2":"","BlurType":""},'
        '{"Key":0,"Title":"北大核心","Logic":2,"Name":"HX","Operate":"=","Value":"Y","ExtendType":14,"ExtendValue":"","Value2":"","BlurType":""},'
        '{"Key":0,"Title":"CSSCI","Logic":2,"Name":"CSI","Operate":"=","Value":"Y","ExtendType":14,"ExtendValue":"","Value2":"","BlurType":""},'
        '{"Key":0,"Title":"CSCD","Logic":2,"Name":"CSD","Operate":"=","Value":"Y","ExtendType":14,"ExtendValue":"","Value2":"","BlurType":""}'
        '],"ChildItems":[]}]},'
        '{"Key":"NaviParam","Title":"","Logic":1,"Items":['
        '{"Key":"navi","Title":"","Logic":1,"Name":"专题子栏目代码","Operate":"=","Value":"C033_8?","ExtendType":13,"ExtendValue":"","Value2":"","BlurType":""}'
        '],"ChildItems":[]},'
        '{"Key":"MutiGroup","Title":"","Logic":1,"Items":[],"ChildItems":['
        '{"Key":"3","Title":"","Logic":1,"Items":['
        '{"Key":"' + year + '","Title":"' + year + '","Logic":2,"Name":"年","Operate":"","Value":"' + year + '","ExtendType":0,"ExtendValue":"","Value2":"","BlurType":""}'
        '],"ChildItems":[]}]}]}}'
    )


def _fetch_search_page(year, page):
    """POST the search form for *year* and return result page *page* as HTML text."""
    data = {
        'IsSearch': 'true',
        'QueryJson': _query_json(year),
        'PageName': 'AdvSearch',
        'DBCode': 'CJFQ',
        'CurPage': page,
    }
    return requests.post(_SEARCH_URL, headers=_SEARCH_HEADERS, data=data,
                         timeout=_REQUEST_TIMEOUT).text


def _clean_authors(fragment):
    """Turn the author HTML fragment into a '|'-separated string."""
    # Each closing </span> becomes a separator before the tags are removed;
    # the digits 1-8 and commas are then deleted in a single pass.
    return remove_tags(fragment.replace('</span>', '|')).translate(_AUTHOR_STRIP)


def _clean_affiliations(fragment):
    """Turn the affiliation HTML fragment into a '|'-separated string."""
    text = remove_tags(fragment.replace('</span>', '|'))
    text = text.replace('&nbsp;', '').replace('1. ', '')
    # The "2. " .. "6. " numbering prefixes become separators.
    for index in range(2, 7):
        text = text.replace(str(index) + '. ', '|')
    return text


def _join_texts(texts):
    """Join xpath text nodes with ', ', dropping quotes and brackets; ';' becomes ' '."""
    joined = ', '.join(piece.strip() for piece in texts)
    for unwanted in ("'", '[', ']'):
        joined = joined.replace(unwanted, '')
    return joined.replace(';', ' ')


def _join_keywords(texts):
    """Join keyword text nodes into a single space-separated string."""
    words = []
    for piece in texts:
        # Remove all whitespace on the real text instead of on repr(list), so
        # keywords that happen to contain the letters "rn" are left intact.
        word = ''.join(piece.split())
        for unwanted in ("'", ';', '[', ']', '\\'):
            word = word.replace(unwanted, '')
        word = word.replace(',', ' ')
        if word:
            words.append(word)
    return ' '.join(words)


def _append_row(file, row):
    """Append one row to the CSV file, reopening it so each row is flushed at once."""
    with open(file, 'a+', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(row)


def get_one(year):
    """Return the number of result pages for *year*.

    Requests page 1 of the search results and reads the pager text.

    Args:
        year: Publication year as a string (it is concatenated into the query).

    Returns:
        The text captured between "共" and "页" in the pager, as a string.

    Raises:
        IndexError: If the response contains no pager fragment.
    """
    response = _fetch_search_page(year, '1')
    totals = _TOTAL_PAGES_RE.findall(response)
    if not totals:
        raise IndexError('no page-count element found in the search response for year ' + year)
    return totals[0]


def get_two(year, i, file):
    """Fetch result page *i* for *year* and append one CSV row per article.

    For every row on the result page the article's detail page is downloaded
    and parsed. Articles whose detail page does not contain the expected
    header block are skipped.

    Args:
        year: Publication year as a string.
        i: Result page number, sent as the ``CurPage`` form field.
        file: Path of the CSV file to append to.
    """
    response = _fetch_search_page(year, i)
    for dbcode, dbname, filename, source_cell, date_cell, download_cell in _RESULT_ROW_RE.findall(response):
        periodical = remove_tags(source_cell).strip()  # journal
        date = str(date_cell).strip()  # publication date
        download = remove_tags(download_cell).strip()  # download count
        if not download:
            download = 0

        url_2 = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode=' + dbcode + '&dbname=' + dbname + '&filename=' + filename
        print(url_2)
        response_2 = requests.get(url_2, timeout=_REQUEST_TIMEOUT)
        text = response_2.content.decode('utf-8')

        headers_found = _DETAIL_HEADER_RE.findall(text)
        if not headers_found:
            # Without the header block there is no title/author/affiliation to
            # record; skip rather than fail on unbound or stale values.
            continue

        abstracts = _ABSTRACT_RE.findall(text)
        abstract = abstracts[0].replace('&lt;正&gt;', '') if abstracts else ' '

        html = etree.HTML(text)
        keyword_nodes = html.xpath("//p[@class='keywords']")  # keywords
        info_items = html.xpath("//div[@class='row']/ul/li")

        classification_Number = ' '
        special = ' '
        if info_items:
            # The last <li> is read as the classification number and the one
            # before it as the special topic.
            classification_Number = _join_texts(info_items[-1].xpath("./p/text()"))
            if len(info_items) >= 2:
                special = _join_texts(info_items[-2].xpath("./p/text()"))

        key_words = ' '
        if keyword_nodes:
            key_words = _join_keywords(keyword_nodes[0].xpath("./a/text()"))

        for title_html, author_html, company_html in headers_found:
            subject = remove_tags(title_html)  # title
            author = _clean_authors(author_html)
            print(author)
            company = _clean_affiliations(company_html)  # affiliation
            _append_row(file, [subject, author, company, periodical, date, download,
                               abstract, key_words, special, classification_Number])
# Driver: crawl every year from 1998 through 2021 into one CSV file.
name = '铁路运输管理工程.csv'
file = write_data(name)
for year in map(str, range(1998, 2022)):
    # get_one returns the page count as text; convert it before iterating.
    number = int(get_one(year))
    for i in range(1, number + 1):
        get_two(year, i, file)
        # Wait one second after each result page.
        time.sleep(1)
print('下载完成')
# year = '2021' 8051
# file = write_data(year)
# number = get_one(year)
# number = int(number)
# for i in range(number):
#     get_two(i,file)
#     time.sleep(1)
# print('下载完成')
# 改进:
#     1. 按年份自动搜索
#     2. 年份进入之后自动统计总页数, 然后按页数进行爬取

使用Python爬取知网信息相关推荐

  1. Python爬取知网信息——Python+selenium爬取知网信息(文献名,作者,来源,发表日期,文献类型)

    # -*- coding: utf-8 -*- #时间:2019.5.1 #运行环境Python 3.* ''' 1.运行此代码前需要先下载Chrome浏览器,去百度搜索下载 2.我是利用seleni ...

  2. 用python爬取基金网信息数据,保存到表格,并做成四种简单可视化。(爬虫之路,永无止境!)

    用python爬取基金网信息数据,保存到表格,并做成四种简单可视化.(爬虫之路,永无止境!) 上次 2021-07-07写的用python爬取腾讯招聘网岗位信息保存到表格,并做成简单可视化. 有的人留 ...

  3. [python爬虫] BeautifulSoup和Selenium简单爬取知网信息测试

    作者最近在研究复杂网络和知识图谱内容,准备爬取知网论文相关信息进行分析,包括标题.摘要.出版社.年份.下载数和被引用数.作者信息等.但是在爬取知网论文时,遇到问题如下:   1.爬取内容总为空,其原因 ...

  4. 用python爬取网站_「自如网」关于用python爬取自如网信息的价格问题(已解决) - seo实验室...

    自如网 ###这是一篇求助文,我能获取图片并变成字符串,但是无法获取位移量### 前两坛突发奇想想要爬取自如网的租房数据,本来以为能够请求+美丽+ re能全部搞定,没想到这个网站的反爬机制有点让我搞不 ...

  5. 关于用python爬取自如网信息的价格问题(已解决)

    ###这是一篇求助文,我能获取图片并变成字符串,但是无法获取位移量### 前两坛突发奇想想要爬取自如网的租房数据,本来以为能够请求+美丽+ re能全部搞定,没想到这个网站的反爬机制有点让我搞不定先贴个 ...

  6. python爬虫知网实例-python爬取知网

    广告关闭 腾讯云双11爆品提前享,精选热门产品助力上云,云服务器首年88元起,买的越多返的越多,最高满返5000元! https://github.com/gnemoug/distribute_crawler ...

  7. python 爬取知网url

    由于知网存在异步加载,爬取需要获得queryid,cookies等 代码如下: #coding:utf-8 ''' Created on 2016-8-15@author: 刘帅 ''' import ...

  8. 爬虫练习(一)爬取知网、万方、维普文献标题、作者、刊物来源等信息

    刚刚开始学习Python,今天一周整,浏览了站内一些大侠博客相关内容,自己也尝试着写了一个爬虫. 直接借鉴了几位大侠的经验,如有冒犯之处恳请海涵. 先说说目的吧,本爬虫的目的是根据EXCEL文件的作者 ...

  9. python抓取文献关键信息,python爬虫——使用selenium爬取知网文献相关信息

    python爬虫--使用selenium爬取知网文献相关信息 写在前面: 本文章限于交流讨论,请不要使用文章的代码去攻击别人的服务器 如侵权联系作者删除 文中的错误已经修改过来了,谢谢各位爬友指出错误 ...

最新文章

  1. .NET2.0抓取网页全部链接【月儿原创】
  2. 【每日学习Mybatis中基础】trim标签使用
  3. python的none是什么-python中的none类型
  4. QT的QSignalBlocker类的使用
  5. xtrabackup对MySQL数据库的备份及恢复教程
  6. python具体工作内容_有没有人知道公司里用python工作的内容有什么
  7. php 常用正则运算
  8. php与ipa接口登录验证失败,thinkPHP5.0开发微信小程序登录接口signature验证失败
  9. php 队列 api,GitHub - shirakun/think-queue: ThinkPHP 队列支持
  10. python logging日志分割_Python3测井曲线切割,python3logging,日志
  11. python批量处理excel——给指定单元格填充颜色
  12. DHTMLET-Cascading Style Sheet 2.0
  13. ImageLoader(UIL)自定义HTTP Header信息
  14. 高速刹车失灵,特斯拉回应女子坐车顶维权
  15. 微型计算机与接口技术答案,微机原理及接口技术-习题答案
  16. syncthing搭建自己的同步云
  17. 第14章可迭代的对象,迭代器和生成器
  18. 3-2加法器、4-2压缩器、5-2压缩器
  19. echarts实现省市地图
  20. Formal Verification (五) coverage、sign-off flow

热门文章

  1. ubuntu 18.04 安装 xfce 4.14
  2. 【技术团队怎么带】技术团队领导者实操技能系列 (1)
  3. 从使命召唤手游国际版将于暑假公测 谈论代理IP在游戏中的作用
  4. 传统的关系型数据库优缺点
  5. 通信设备市场遭遇狼来了,华为面临三星挑战
  6. 操作系统0x08-内存管理概念
  7. 模拟器中指定应用打开闪退(之前没有问题),其他应用没有该问题
  8. python模拟linux键盘上键和回车_linux系统运维,挂载和分区概念太难?在虚机下操作一次全掌握...
  9. 商品sku 和批量编辑 js算法
  10. 【Linux篇】awk命令详解