前言

软件：PyCharm、Navicat

一、需求

获取相应关键词的数据
数据：题目、数据链接、作者、日期、摘要、数据库名字
将数据保存到数据库中，再进行后续的数据处理

二、步骤

1.引入库

# selenium: browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
# bs4: HTML parsing
from bs4 import BeautifulSoup
import re
# time handling
import time
import pandas as pd
from datetime import datetime
# database connection
import pymysql

2.selenium打开虚拟窗口

打开窗口

#发送请求
def askurl(url, useragent='', ip=''):
    """Open ``url`` in a new Edge window and return the live WebDriver.

    Any extra windows that exist after the page has loaded are closed so
    that the returned driver is focused on the window that loaded ``url``.

    Args:
        url: Address to load.
        useragent: Optional User-Agent string. When empty, the browser's
            default User-Agent is left untouched.
        ip: Optional proxy given as ``host:port``. When empty, no proxy is
            configured.

    Returns:
        The ``webdriver.Edge`` instance. The caller owns it and must call
        ``quit()`` when finished.
    """
    options = webdriver.EdgeOptions()
    # Chromium switches take the form ``--name=value`` with no spaces
    # around ``=``; any other spelling is silently ignored by the browser.
    if useragent:
        options.add_argument("--user-agent={}".format(useragent))
    if ip:
        options.add_argument("--proxy-server=http://{}".format(ip))

    driver = webdriver.Edge(options=options)
    try:
        driver.get(url)

        # Close every window except the one that loaded ``url``.
        main_window = driver.current_window_handle
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
                time.sleep(1)

        # Closing a window leaves the driver without focus; go back to main.
        driver.switch_to.window(main_window)
    except Exception:
        # Do not leave an orphaned browser process behind on failure.
        driver.quit()
        raise
    return driver

数据处理

接收返回的信息

    # Open the Web of Science basic-search page in a browser window.
    # NOTE(review): ``tk`` is presumably the module that defines askurl();
    # it is not imported in the snippets shown here - confirm.
    url = 'https://www.webofscience.com/wos/woscc/basic-search'
    driver = tk.askurl(url)
    # Fixed pause before the page is inspected by the following steps.
    time.sleep(2)

点击开头的无关选项

    # Take one snapshot of the page and use it to decide which overlays
    # are present; both checks below look at this same snapshot.
    page = driver.page_source.encode('UTF-8')
    page = BeautifulSoup(page, 'lxml')

    # Cookie banner: accept it so it no longer covers the search form.
    if page.find_all('button', class_='cookie-setting-link'):
        driver.find_element(By.CSS_SELECTOR, '#onetrust-accept-btn-handler').click()
        time.sleep(5)

    # Pendo guide overlay: click its primary button to close it.
    if page.find_all('button', class_='bb-button _pendo-button-primaryButton _pendo-button'):
        driver.find_element(By.CSS_SELECTOR, '#pendo-button-59b176ac').click()

写入关键词，点击检索按钮（复制 selector 路径）

    # Type the search keywords into the query box.
    driver.find_element(By.CSS_SELECTOR, '#mat-input-0').send_keys('high-entropy alloy for hydrogen storage')
    time.sleep(3)

    # Click the search button, then pause before the results are read.
    driver.find_element(By.CSS_SELECTOR, '#snSearchType > div.button-row > button.mat-focus-indicator.cdx-but-md.search.uppercase-button.mat-flat-button.mat-button-base.mat-primary').click()
    time.sleep(5)

题目 a 链接的获取

 # Walk through every result page; for each record open its detail page,
 # extract the fields and hand them to information(...).connectmysql().
 # NOTE(review): ``information`` is defined outside the snippets shown here;
 # presumably it stores one record in MySQL - confirm.
 n = 0
 try:
     while True:
         # Load the first page or advance to the next one.
         n = n + 1
         page3 = change_page(driver, n)

         # Total number of result pages as displayed by the pager.
         page_num = page3.find('span', class_='end-page ng-star-inserted').text
         print(page_num)
         if n > int(page_num):
             print('全部读取完了')
             break

         # i is the 1-based position of the record in the list; it is used
         # in the nth-child selector that clicks the record's title.
         for i, da in enumerate(page3.find_all('app-record', class_='ng-star-inserted'), start=1):
             print('==================第'+str(i)+'条数据======================')

             # Title link of the record; records without one are skipped.
             data = BeautifulSoup(str(da), 'lxml')
             data = data.select('a[class="title title-link font-size-18 ng-star-inserted"]')
             if not data:
                 continue

             link = data[0].get('href')
             if link:
                 articalink = 'https://www.webofscience.com' + str(link)
                 print(articalink)
             else:
                 articalink = ''

             # Live-DOM selector for the title link of record i.
             record_selector = ('body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-records-list > app-record:nth-child(' + str(i) + ') > div > div > div.data-section > div:nth-child(1) > app-summary-title > h3 > a')

             # Open the detail page by clicking the title.
             try:
                 driver.find_element(By.CSS_SELECTOR, record_selector).click()
                 time.sleep(10)
             except Exception:
                 # The record may not be rendered yet: scroll through the
                 # list so it gets loaded, then try once more.
                 print('页面刷新 刷新一下')
                 for s in range(0, 12000, 250):
                     time.sleep(0.1)
                     driver.execute_script('window.scrollTo(0, %s)' % s)
                 time.sleep(10)
                 try:
                     driver.find_element(By.CSS_SELECTOR, record_selector).click()
                     time.sleep(6)
                 except Exception:
                     print("两次点击失败......")
                     # Still on the result list: there is no detail page to
                     # parse, and going back would leave the list.
                     continue

             # Detail page: collect the detail fields.
             page4 = driver.page_source.encode('UTF-8')
             page4 = BeautifulSoup(page4, 'lxml')
             pp = page4.find('div', class_='data-column ng-star-inserted')
             if pp:
                 page5 = BeautifulSoup(str(pp), 'lxml')

                 # Article title
                 artname = page5.find('h2', class_='title text--large')
                 if artname:
                     articalname = artname.text
                     print(articalname)
                 else:
                     articalname = ''

                 # Authors, with the leading label text removed
                 aur = page5.find('div', class_='authors-div')
                 if aur:
                     author = str(aur.text).replace(' 作者:', '')
                     print(author)
                 else:
                     author = ''

                 # Publication date; None when missing or unparseable
                 reltime = page5.find('span', id='FullRTa-pubdate')
                 if reltime:
                     try:
                         releasetime = pd.to_datetime(str(reltime.text))
                     except Exception:
                         releasetime = None
                     print(releasetime)
                 else:
                     releasetime = None

                 # Value of the doctype field (stored as dbname)
                 dbn = page5.find('span', id='FullRTa-doctype-0')
                 if dbn:
                     dbname = dbn.text
                     print(dbname)
                 else:
                     dbname = ''

                 # Abstract
                 ab = page5.find('div', id='FullRTa-abstract-basic')
                 if ab:
                     abstract = ab.text
                     print(abstract)
                 else:
                     abstract = ''

                 print(articalname, articalink, author, releasetime, dbname, abstract)
                 information(articalname, articalink, author, releasetime, dbname, abstract).connectmysql()

             # Return to the result list for the next record.
             # NOTE(review): the original nesting of this step was lost when
             # the snippet was flattened; it is placed so that it runs after
             # every successful click - confirm.
             driver.back()
             time.sleep(10)
 finally:
     # Always close the browser, even when the crawl stops on an error.
     driver.quit()

点击翻页

def change_page(driver, n):
    """Show result page ``n`` and return its HTML parsed with BeautifulSoup.

    For ``n == 1`` the page that is already displayed is used. For any other
    ``n`` the pager's next-page button is clicked first. In both cases the
    window is scrolled step by step before the HTML is read, so that records
    which are only rendered on scroll are present (the original author notes
    that all 18 records must be loaded).

    Args:
        driver: Selenium WebDriver currently showing a result list.
        n: 1-based number of the page being read; only used for the log line
            and to decide whether the next-page button must be clicked.

    Returns:
        A ``BeautifulSoup`` document built from ``driver.page_source``.
    """
    print('=================================第'+str(n)+'页====================================')

    first_page = n == 1
    if not first_page:
        # Click through JavaScript rather than WebElement.click().
        next_button = driver.find_element(By.CSS_SELECTOR, 'body > app-wos > div > div > main > div > div.held > app-input-route > app-base-summary-component > div > div.results.ng-star-inserted > app-page-controls:nth-child(4) > div > form > div > button:nth-child(4)')
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(6)

    # Scroll down in small steps so every record on the page gets rendered.
    for offset in range(0, 15000, 250):
        time.sleep(0.1)
        driver.execute_script('window.scrollTo(0, %s)' % offset)
    time.sleep(1)

    content = driver.page_source.encode('UTF-8')
    soup = BeautifulSoup(content, 'lxml')
    if first_page:
        # Extra pause kept from the original first-page branch.
        time.sleep(2)
    return soup

总结

暂时没有；有问题随时私信我~

python+selenium+bs4爬取web of science的数据相关推荐

python + selenium多进程爬取淘宝搜索页数据
python + selenium多进程爬取淘宝搜索页数据 1. 功能描述按照给定的关键词,在淘宝搜索对应的产品,然后爬取搜索结果中产品的信息,包括:标题,价格,销量,产地等信息,存入mongodb ...
python+selenium+phantomJS爬取国家地表水水质自动监测实时数据发布系统——动态网页爬虫
一.关于phantomjs 1.介绍 PhantomJS是一个为自动化而生的利器,它本质上是一个基于webkit内核的无界面浏览器,并可使用JavaScript或CoffeeScript进行编程.由于 ...
[转载] python+selenium定时爬取丁香园的新冠病毒每天的数据，并制作出类似的地图（部署到云服务器）
参考链接: Python vars() python+selenium定时爬取丁香园的新冠病毒每天的数据,并制作出类似的地图(部署到云服务器) 声明:仅供技术交流,请勿用于非法用途,如有其它非法用途造 ...
python+selenium定时爬取丁香园的新冠病毒每天的数据，并制作出类似的地图（部署到云服务器）
python+selenium定时爬取丁香园的新冠病毒每天的数据,并制作出类似的地图(部署到云服务器) 声明:仅供技术交流,请勿用于非法用途,如有其它非法用途造成损失,和本博客无关目录 python ...
python + selenium +chrome爬取qq空间好友说说并存入mongodb数据库
python + selenium +chrome爬取qq空间好友说说并存入mongodb数据库准备阶段在正式开始在前需要先准备好做爬虫的工具,本例使用chrome无头浏览器进行爬取工作,也可使用 ...
python用bs4爬取豆瓣电影排行榜 Top 250的电影信息和电影图片，分别保存到csv文件和文件夹中
python用bs4爬取豆瓣电影排行榜 Top 250的电影信息和图片,分别保存到csv文件和文件夹中. 爬取的数据包括每个电影的电影名 , 导演 ,演员 ,评分,推荐语,年份,国家,类型. py如果 ...
Python requests+bs4爬取中药数据库TCMSP的资源获得清肺排毒汤的靶向基因(曲线救国)
Python requests+bs4 爬取TCMSP的资源获得清肺排毒汤的靶向基因为研究清肺排毒汤的中药材对于新冠肺炎的作用机制,需要收集相关数据,于是将目光洒向了TCMSP.. 检索首页是这样的 ...
python利用bs4爬取外国高清图片网站
python利用bs4爬取外国高清图片网站爬取高清图片爬取高清图片 import re import requests from bs4 import BeautifulSoup import o ...
基于selenium+scrapy爬取复仇者联盟4豆瓣影评数据
基于selenium+scrapy爬取复仇者联盟4豆瓣影评数据参考资料: 黑马程序员爬虫教程静觅爬虫教程 mac下anaconda安装selenium+PhantomJS scrapy下载中间件结 ...

python+selenium+bs4爬取web of science的数据

文章目录

前言

一、需求

二、步骤

1.引入库

2.selenium打开虚拟窗口

打开窗口

数据处理

点击翻页

总结

python+selenium+bs4爬取web of science的数据相关推荐

最新文章

热门文章