The project consists of two Python files.

The main script, Search&Download.py:

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from Download import Hubber
import xlwt, os
from time import sleep
from tqdm import tqdm

TotalNum = 0


class Article(object):
    title = ""
    article_link = ""
    authors = ""
    authors_link = ""
    abstract = ""

    def __init__(self):
        self.title = "New Paper"


def save_xls(sheet, paper):
    # write the paper's fields into one row of the Excel sheet
    global TotalNum
    sheet.write(TotalNum, 0, TotalNum)
    sheet.write(TotalNum, 1, paper.title)
    sheet.write(TotalNum, 2, paper.article_link)
    sheet.write(TotalNum, 3, paper.journal)
    sheet.write(TotalNum, 4, paper.authors_link)
    sheet.write(TotalNum, 5, paper.abstract)
    TotalNum += 1


head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}  # updated 2021-06-07 to avoid HTTP 403 errors

article_titles = []
article_links = []


def GetInfo(sheet, url):
    # parse one result page and save every entry (class "gs_ri") to the sheet
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    articles = soup.find_all(class_="gs_ri")
    for article in articles:
        paper = Article()
        try:
            title = article.find('h3')
            paper.title = title.text
            article_titles.append(paper.title)
            paper.article_link = title.a.get('href')
            article_links.append(paper.article_link)
            journal = article.find(class_="gs_a")
            paper.journal = journal.text
            authors_addrs = journal.find_all('a')
            for authors_addr in authors_addrs:
                paper.authors_link = paper.authors_link + authors_addr.get('href') + "\n"
            abstract = article.find(class_="gs_rs")
            paper.abstract = abstract.text
        except:
            continue
        save_xls(sheet, paper)
    return


def getArticle(article_titles, article_links):
    # download every collected link into .\Articles\<keyword>\
    dir = ".\\Articles\\" + keyword + "\\"
    if os.path.exists(dir) == False:
        os.mkdir(dir)
    for k in tqdm(range(len(article_titles))):
        article_titles[k] = "{0}".format(article_titles[k].replace(':', ' ')).replace('.', '')
        path = dir + article_titles[k] + ".pdf"
        try:
            Hubber.getPDF(article_links[k], path)
            sleep(0.5)
        except:
            continue


if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'PaperInfo', True)
    column = ['序号', '文章题目', '文章链接', '期刊', '作者链接', '摘要']
    for i in range(0, len(column)):
        sheet1.write(TotalNum, i, column[i])
    TotalNum += 1
    keyword = input("keywords is?\n")
    # example: diabetes and conjunctiva and (microcirculation or microvasculature)
    key = keyword.replace(" ", "+")
    info = keyword + "_PaperInfo.xls"
    print("\n" + "检索中……")
    if os.path.exists(info) == True:
        print("\n" + "PaperInfo already exists!")
    else:
        start = 0
        for i in tqdm(range(10)):
            url = 'https://xs.dailyheadlines.cc/scholar?start=' + str(start) + '&q=' + key + '&hl=zh-CN&as_sdt=0,5'
            start = start + 10
            GetInfo(sheet1, url)
            myxls.save(keyword + '_PaperInfo.xls')
            sleep(0.5)
        print("\n" + "检索完成")
    print("\n" + "下载中……")
    if len(article_titles) != 0:
        getArticle(article_titles, article_links)
    else:
        # fall back to the previously saved xls when this run skipped the search step
        import xlrd
        data = xlrd.open_workbook(info)
        table = data.sheet_by_index(0)
        article_titles = table.col_values(1)[1:]
        article_links = table.col_values(2)[1:]
        getArticle(article_titles, article_links)
    print("\n" + "下载完成")
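One weak point worth noting: the title cleanup in getArticle only strips ':' and '.', so titles containing characters such as '?', '/', or '"' can still produce invalid Windows file names. A minimal sketch of a more defensive sanitizer (the helper name safe_filename is an addition for illustration, not part of the original script), which could replace the replace() chain above:

import re

def safe_filename(title, max_len=120):
    # drop every character Windows forbids in file names, collapse whitespace, cap the length
    cleaned = re.sub(r'[\\/:*?"<>|]', ' ', title)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned[:max_len]

# e.g. safe_filename('Diabetes: a review?') -> 'Diabetes a review'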

The supporting script, Download.py; more publisher sites can be added to it (a sketch of how to do that follows the code):

import os.path
import re
import requests
from bs4 import BeautifulSoup


class Hubber:
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }  # updated 2021-06-07 to avoid HTTP 403 errors

    def pdf_hub(url, path):
        # fetch a direct PDF link and write it to disk
        try:
            pdf = requests.get(url, headers=Hubber.head)
            with open(path, "wb") as f:
                f.write(pdf.content)
            print("\n" + "pdf found directly!")
        except:
            print("\n" + "failed to download pdf directly!\n" + url)
            Hubber.err_log(url)

    def sci_hub(path, doi):
        # resolve a DOI through sci-hub and download the PDF embedded in the iframe
        doi = str(doi).split("https://doi.org/")[1]
        url = "https://www.sci-hub.ren/doi:" + doi + "#"
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        download_url = soup.iframe.attrs["src"]
        try:
            download_r = requests.get(download_url, headers=Hubber.head)
            download_r.raise_for_status()
            with open(path, "wb+") as temp:
                temp.write(download_r.content)
            print("\n" + "Article downloaded by doi!")
        except:
            print("\n" + "failed to download pdf by doi!\n" + url)
            Hubber.err_log(url)

    def err_log(url):
        with open("download_err.txt", "a+", encoding="utf-8") as error:
            error.write("PDF not found, download link may be: \n" + url + "\n")

    def getSoup(url):
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        return soup

    def getPDF(url, path):
        if os.path.exists(path) == True:
            print("\n" + "Article already exists")
        else:
            if len(re.findall('pdf', url)) != 0:
                print("\n" + 'pdf link already!')
                Hubber.pdf_hub(url, path)
            elif re.match("https://www.sci-hub.ren/", url):
                print("\n" + 'sci_hub link!')
                url = str(url).replace("https://www.sci-hub.ren/", "https://doi.org/")
                Hubber.sci_hub(path, url)
            # if pdf can be easily found!
            elif re.match("https://academic.oup.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://academic.oup.com" + soup.find(class_="al-link pdf article-pdfLink").get('href')
                Hubber.pdf_hub(pdf_link, path)
                # alternative: go through sci-hub via the DOI
                # doi = soup.select('div[class="ww-citation-primary"]')[0].a.get('href')
                # Hubber.sci_hub(path, doi)
            elif re.match("https://content.iospress.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="btn btn-download btn-right get-pdf").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://wwwnature.53yu.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="c-pdf-download__link").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://bjo.bmj.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://bjo.bmj.com" + soup.find(class_="article-pdf-download").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://jamanetwork.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://jamanetwork.com" + soup.find(class_="toolbar-tool toolbar-pdf al-link pdfaccess").get('data-article-url')
                Hubber.pdf_hub(pdf_link, path)
            # if pdf can't be easily found, but doi can!
            elif re.match("https://sciencedirect.53yu.com/", url):
                soup = Hubber.getSoup(url)
                doi = soup.find(class_="doi").get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://diabetes.diabetesjournals.org/", url):
                soup = Hubber.getSoup(url)
                doi = soup.select('.citation-doi')[0].a.get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://journals.lww.com/", url):
                soup = Hubber.getSoup(url)
                doi = "https://doi.org/" + str(soup.find(id="ej-journal-doi").text).split("doi: ")[1]
                Hubber.sci_hub(path, doi)
            else:
                # still to be handled, e.g.:
                # https://europepmc.org/
                # https://iovs.arvojournals.org/
                # https://linkspringer.53yu.com/
                print("\n" + "To be prettified! Download link may be: " + "\n" + url)
                Hubber.err_log(url)


if __name__ == '__main__':
    url = "https://www.nature.com/articles/s41598-021-87315-7.pdf"
    url1 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    url2 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    Hubber.getPDF(url, "test.pdf")
    Hubber.getPDF(url1, "test1.pdf")
    Hubber.getPDF(url2, "test2.pdf")
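To support another publisher, add one more elif branch in Hubber.getPDF that matches the site's URL prefix and extracts either a direct PDF link (then call Hubber.pdf_hub) or a DOI (then call Hubber.sci_hub). A minimal sketch for a hypothetical site; the URL prefix and the citation_doi meta tag are assumptions that must be checked against the real page before folding the logic into getPDF:

import re
from Download import Hubber

def handle_example_journal(url, path):
    # hypothetical handler: read the DOI from a <meta name="citation_doi"> tag,
    # then hand it to the existing sci_hub helper
    if re.match("https://example-journal.org/", url):
        soup = Hubber.getSoup(url)
        meta = soup.find("meta", attrs={"name": "citation_doi"})
        if meta is not None:
            Hubber.sci_hub(path, "https://doi.org/" + meta.get("content"))
        else:
            Hubber.err_log(url)

The three sites listed in the final else branch (europepmc.org, iovs.arvojournals.org, linkspringer.53yu.com) are natural candidates for this kind of extension.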
