爬虫主方法

# -*- coding: utf-8 -*-
import os
import random
import re
import sys
import threading
import time
import urllib.parse
import urllib.request

import chardet
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Number of files downloaded so far in this run; incremented by count().
count1 = 0
# Module-level lock; no code visible in this file acquires it.
lock = threading.Lock()# Send an HTTP request and fetch the yearbook catalog data
def get_result(ybcode, page=1, timeout=60):
    """Fetch one page of a yearbook's catalog listing from data.cnki.net.

    Args:
        ybcode: Yearbook code identifying the yearbook, e.g. 'N2014030143'.
        page: Catalog page number to request (callers count from 1).
        timeout: Socket timeout in seconds for the HTTP request, so that a
            stalled connection fails instead of hanging forever.

    Returns:
        The response body decoded as UTF-8 text.

    On any failure the error is reported on stderr and the process exits with
    status 0. Exiting is deliberate: an external watchdog is expected to
    notice that the crawler stopped and start it again.
    """
    try:
        # Form fields posted to the catalog endpoint.
        data = {
            'ybcode': ybcode,
            'entrycode': '',
            'page': page,
            'pagerow': '20',
            'Referer': 'http://data.cnki.net/Yearbook',
        }
        # Request headers.
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            # The cookie may need to be refreshed from time to time.
            # NOTE(review): this is a hard-coded session cookie; consider
            # loading it from configuration instead of source code.
            'Cookie': 'Ecp_ClientId=2201106155502682665; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"sh0292",'
                      '"ShowName":"%e4%b8%ad%e5%9b%bd%e7%9f%bf%e4%b8%9a%e5%a4%a7%e5%ad%a6%e5%9b%be%e4%b9%a6%e9%a6%86",'
                      '"UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"R7eKrF"}; '
                      'c_m_LinID=LinID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=11/06/2020 17:39:14; '
                      'LID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; c_m_expire=2020-11-06 17:39:14; '
                      'Hm_lvt_911066eb2f53848f7d902db7bb8ac4d7=1604650989,1604651116,1604651168,1604654428; '
                      'ASP.NET_SessionId=3d0xpwff2pt0exxcclmw3we4; SID=009023; '
                      'Hm_lpvt_911066eb2f53848f7d902db7bb8ac4d7=1604654428',
            'Referer': 'https://login.cnki.net/login/?platform=kns&ForceReLogin=1&ReturnURL=https://www.cnki.net/',
        }
        # Endpoint that returns the yearbook catalog.
        url = "https://data.cnki.net/Yearbook/PartialGetCatalogResult"
        # URL-encode the form fields into the POST body.
        params = urllib.parse.urlencode(data).encode(encoding='utf-8')
        req = urllib.request.Request(url, params, headers)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(req, timeout=timeout) as r:
            res = str(r.read(), 'utf-8')
        return res
    except Exception as e:
        # Exit right away so the monitoring program can restart the crawler,
        # but leave a trace of what went wrong first.
        print('get_result failed: ' + repr(e), file=sys.stderr)
        sys.exit(0)


# Get the total number of pages
def get_pageno(ybcode):
    """Return the number of catalog pages for the yearbook *ybcode*.

    The count is read from the first element matching the CSS class
    ``s_p_listl`` on the first catalog page: the text after the second '共'
    and before the following '页' is parsed as an integer. The total is also
    printed to stdout.
    """
    first_page = get_result(ybcode)
    pager = BeautifulSoup(first_page, 'lxml').select('.s_p_listl')[0]
    after_second_marker = pager.get_text().split("共")[2]
    pages = int(after_second_marker.split('页')[0])
    print(f'总共{pages}页')
    return pages


# Data cleaning: remove every \n and \r from the text
def dataclear(data):
    """Return *data* with line breaks and spaces removed and '>' runs dashed.

    Applied in order: runs of '\\n' are removed, runs of '\\r' are removed,
    runs of spaces are removed, and each run of '>' becomes a single '-'.
    Callers use the result as a file or directory name.
    """
    substitutions = (
        ('\n+', ''),
        ('\r+', ''),
        (' +', ''),
        ('>+', '-'),
    )
    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)
    return data


# Download all Excel tables of CNKI statistical yearbooks and similar publications
def filedata(yearBook, yearBookName):
    """Download every table of one yearbook into its own sub-directory.

    Args:
        yearBook: Dict with the keys 'ybcode' (yearbook code) and 'year'
            (label used as the directory-name prefix).
        yearBookName: Yearbook title, appended to the year to form the
            directory name.

    A directory named ``<year><yearBookName>`` is created under the current
    working directory if missing and made the working directory while the
    catalog pages are walked; the previous working directory is restored at
    the end. Rows without the download icon are skipped, as are tables whose
    cleaned title contains '附录' (appendix) and tables already on disk.

    Any exception while handling a row is printed and the process exits with
    status 0, matching the rest of the crawler (a watchdog restarts it).
    """
    ybcode = yearBook.get('ybcode')
    base_dir = os.getcwd()
    # Clean only the new directory's own name. Cleaning the whole absolute
    # path would strip spaces or '>' from the parent path as well.
    dictionaryName = os.path.join(
        base_dir, dataclear(yearBook.get('year') + yearBookName))
    pageno = get_pageno(ybcode)
    print(os.getcwd())
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    for i in range(1, pageno + 1):
        print('######当前第' + str(i) + '页######')
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        for j in soup.select('tr'):
            # Parse the row once and reuse it for every selector below.
            s = BeautifulSoup(str(j), 'lxml')
            if not s.select('img[src="/resources/design/images/nS_down2.png"]'):
                continue
            try:
                links = s.select('td:nth-of-type(3) > a')
                # The download link is the second anchor of the third cell,
                # so at least two anchors are required.
                if len(links) < 2:
                    continue
                href = links[1].get('href')
                # Table title; uncleaned it may contain '\n' and similar
                # characters that break the file name.
                title = dataclear(
                    str(s.select('td:nth-of-type(1) > a')[0].get_text()))
                url = 'http://data.cnki.net' + href
                # Code identifying the table, taken from the link's query.
                code = href.split("=")[1]
                if os.path.isfile(os.path.join(dictionaryName, title + '.xls')):
                    print('已存在：' + title)
                elif '附录' not in title:
                    # Appendices sometimes carry a second-level address that
                    # causes errors, so they are not downloaded for now.
                    # Wait a random 8-12 seconds between downloads.
                    time.sleep(random.random() * 4 + 8)
                    print(filedown(title, url, code))
            except Exception as e:
                print('error:-------------------' + str(e))
                sys.exit(0)
    os.chdir(base_dir)


def count():
    """Increment the module-level download counter and print the new total."""
    global count1
    count1 = count1 + 1
    print('=====已下载：' + str(count1) + '个')


# File download function
def _find_downloaded(path, code):
    """Return the first finished file name in *path* containing *code*, or None.

    Names ending in '.crdownload' are ignored: Chrome uses that suffix for
    downloads still in progress, and renaming one would leave a truncated
    file under the final name.
    """
    for name in os.listdir(path):
        if code in name and not name.endswith('.crdownload'):
            return name
    return None


def filedown(title, url, code):
    """Download one table through headless Chrome and rename it to <title>.xls.

    Args:
        title: Cleaned table title used as the final file name (without
            extension).
        url: Page address that offers the download.
        code: Table code expected to appear in the downloaded file's name.

    Returns:
        A status message: "已完成" when a file with *code* was already present
        and has been renamed, '下载完成,重命名：<title>' after a successful
        download, '文件已存在' when the target name is already taken, or
        "下载失败：超时" when no file appeared within 15 seconds.

    Files are written to the current working directory. Any exception while
    driving the browser is printed and the process exits with status 0.
    """
    global browser
    path = os.getcwd()
    newdir = os.path.join(path, title + '.xls')

    # First check whether a file for this code has already been downloaded.
    existing = _find_downloaded(path, code)
    if existing is not None:
        print("文件已存在")
        olddir = os.path.join(path, existing)
        print(olddir)
        print(newdir)
        os.rename(olddir, newdir)
        print('重命名：' + title)
        count()
        return "已完成"

    # Simulate the click with selenium.
    options = webdriver.ChromeOptions()
    # popups=0 disables pop-up windows; downloads go to the working directory.
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': os.getcwd(),
    }
    options.add_experimental_option('prefs', prefs)
    # Headless mode: do not show a browser window.
    options.add_argument('headless')
    # Page load strategy "none": do not wait for the page to finish loading.
    # Set on this Options object so the shared DesiredCapabilities.CHROME
    # dict is not modified for every later browser.
    options.set_capability('pageLoadStrategy', 'none')
    browser = webdriver.Chrome(options=options)
    wait = WebDriverWait(browser, 10)
    try:
        browser.get(url)
        # Wait for the element with id 'Button2' (the original author calls
        # it the login button) and click it to trigger the download.
        loginButton = wait.until(
            EC.presence_of_element_located((By.ID, 'Button2')))
        loginButton.click()

        countdown = 15
        downloaded = _find_downloaded(path, code)
        while downloaded is None:
            time.sleep(1)
            countdown = countdown - 1
            print("=====倒计时：" + str(countdown))
            if countdown < 1:
                return "下载失败：超时"
            downloaded = _find_downloaded(path, code)

        # Rename the downloaded file to its table title.
        olddir = os.path.join(path, downloaded)
        print(olddir)
        print(newdir)
        if os.path.isfile(newdir):
            return '文件已存在'
        os.rename(olddir, newdir)
        count()
        return '下载完成,重命名：' + title
    except Exception as e:
        print(e)
        sys.exit(0)
    finally:
        # Runs on every path, including sys.exit(), so Chrome never leaks.
        browser.quit()


def spider():
    """Crawl every enabled yearbook edition of '中国能源统计年鉴'.

    A directory named after the yearbook is created under the current working
    directory if missing and becomes the working directory; each enabled
    entry of the list below is then handed to filedata().
    """
    # Yearbook title; also used as the directory name.
    yearBooksName = '中国能源统计年鉴'
    dictionaryName = os.path.join(os.getcwd(), yearBooksName)
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    # Editions to crawl. Year labels and codes were taken from the site via
    # the browser's element inspector; uncomment an entry to enable it.
    yearBooks = [
        # {'ybcode': 'N2018070147', 'year': '2017年'},
        # {'ybcode': 'N2017110016', 'year': '2016年'},
        # {'ybcode': 'N2016120537', 'year': '2015年'},
        # {'ybcode': 'N2015110114', 'year': '2014年'},
        {'ybcode': 'N2014030143', 'year': '2013年'},
        {'ybcode': 'N2013020081', 'year': '2012年'},
        # {'ybcode': 'N2012020066', 'year': '2011年'},
        # {'ybcode': 'N2011030123', 'year': '2010年'},
        # {'ybcode': 'N2010080088', 'year': '2009年'},
        # {'ybcode': 'N2009060138', 'year': '2008年'},
        # {'ybcode': 'N2008070077', 'year': '2007年'},
        # {'ybcode': 'N2009100078', 'year': '2006年'},
        # {'ybcode': 'N2009100028', 'year': '2005年'},
        # {'ybcode': 'N2006050898', 'year': '2004年'},
        # {'ybcode': 'N2006050897', 'year': '2000-2002年'},
        # {'ybcode': 'N2005120868', 'year': '1997-1999年'},
        # {'ybcode': 'N2010040156', 'year': '1991年'},
        {'ybcode': 'N2005120869', 'year': '1991-1996年'},
        {'ybcode': 'N2005120761', 'year': '1989年'},
        {'ybcode': 'N2006010708', 'year': '1986年'},
    ]
    for yearBook in yearBooks:
        filedata(yearBook, yearBooksName)


if __name__ == '__main__':
    spider()

持久化运行：监测爬虫程序的运行状态，如果爬虫停止了就自动重新启动。监控脚本直接调用 cmd，用绝对路径运行爬虫，并获取其日志输出。

# -*- coding: UTF-8 -*-
#!DATE: 2018/10/9
#!@Author: yingying
#keeprunning.py
import os
import subprocess# logging
# require python2.6.6 and later
import logging
from logging.handlers import RotatingFileHandler## log settings: SHOULD BE CONFIGURED BY config
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
LOG_PATH_FILE = r"D:\pyCharm\studyTool\my_service_mgr.log"
LOG_MODE = 'a'
LOG_MAX_SIZE = 10 * 1024 * 1024  # 10M per file
LOG_MAX_FILES = 10  # 10 Files: my_service_mgr.log.1, printmy_service_mgrlog.2, ...
LOG_LEVEL = logging.DEBUG
LOG_FORMAT = "%(asctime)s %(levelname)-10s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s"

# Size-based rotating file handler attached to the root logger.
handler = RotatingFileHandler(LOG_PATH_FILE, LOG_MODE, LOG_MAX_SIZE, LOG_MAX_FILES)
formatter = logging.Formatter(LOG_FORMAT)
handler.setFormatter(formatter)

Logger = logging.getLogger()
Logger.setLevel(LOG_LEVEL)
Logger.addHandler(handler)
# color output
#
# Process id of this watchdog, shown in every console message.
pid = os.getpid()


def print_error(s):
    """Print *s* to stdout as an ERROR line, wrapped in ANSI colour code 31."""
    print('\033[31m[%d: ERROR] %s\033[31;m' % (pid, s))


def print_info(s):
    """Print *s* to stdout as an INFO line, wrapped in ANSI colour code 32."""
    print('\033[32m[%d: INFO] %s\033[32;m' % (pid, s))


def print_warning(s):
    """Print *s* to stdout as a WARNING line, wrapped in ANSI colour code 33."""
    print('\033[33m[%d: WARNING] %s\033[33;m' % (pid, s))


def start_child_proc(command, merged):
    """Start *command* as a child process and return its Popen object.

    Args:
        command: Program and arguments in any form subprocess.Popen accepts.
        merged: Kept for interface compatibility. The child's output is not
            piped in either case, so the flag currently has no effect. An
            earlier, disabled variant piped stdout/stderr and merged them
            when this flag was true.

    Raises:
        OSError: if *command* is None ("Invalid command") or the process
            cannot be started ("Failed to run command!").
    """
    if command is None:
        raise OSError("Invalid command")
    try:
        return subprocess.Popen(command)
    except OSError as err:
        raise OSError("Failed to run command!") from err


def run_forever(command):
    """Run *command* and restart it every time it exits; never returns.

    Args:
        command: Program and arguments, as a list or a single string.

    Each restart is reported on the console and in the log together with the
    previous return code and the running restart count.
    """
    # Joining a plain string would insert a space between every character.
    shown = command if isinstance(command, str) else ' '.join(command)
    print_info("start child process with command: " + shown)
    Logger.info("start child process with command: " + shown)

    merged = False
    child = start_child_proc(command, merged)

    failover = 0
    while True:
        while child.poll() is not None:
            failover = failover + 1
            print_warning("child process shutdown with return code: " + str(child.returncode))
            Logger.critical("child process shutdown with return code: " + str(child.returncode))

            print_warning("restart child process again, times=%d" % failover)
            Logger.info("restart child process again, times=%d" % failover)
            child = start_child_proc(command, merged)

        # Block until the child exits. stderr is None unless it was piped, so
        # fall back to an empty value instead of failing on None.
        _, err = child.communicate()
        returncode = child.returncode
        if returncode != 0:
            for errorline in (err or b"").splitlines():
                Logger.info(errorline)
        else:
            # NOTE(review): this message is logged when the child exits with
            # return code 0 — confirm the intended wording.
            Logger.info("execute child process failed")


if __name__ == "__main__":
    # Argument list plus raw string: no reliance on invalid escape sequences,
    # and the command is logged in readable form.
    cmd = ['py', r'D:\pyCharm\studyTool\cnkiCrawler.py']
    run_forever(cmd)

Python 爬取zw年鉴相关推荐

python爬取链家网的房屋数据
python爬取链家网的房屋数据爬取内容爬取源网站爬取内容爬取思路爬取的数据代码获取房屋url 获取房屋具体信息爬取内容爬取源网站北京二手房 https://bj.lianjia. ...
python爬取顶点小说简单版
python爬取顶点小说简单版爬取网络资源首先要下载requests库因为这里面也有数据提取和分析所以也要有etree库,re库下载库的代码是:pip install 库名如:pip inst ...
python爬取电影评分_用Python爬取猫眼上的top100评分电影
代码如下: # 注意encoding = 'utf-8'和ensure_ascii = False,不写的话不能输出汉字 import requests from requests.exception ...
用Python爬取好奇心日报
用Python爬取好奇心日报本项目最后更新于2018-7-24,可能会因为没有更新而失效.如已失效或需要修正,请联系我! 本项目已授权微信公众号"菜鸟学Python"发表文章爬 ...
python爬取新闻并归数据库_Python爬取数据并写入MySQL数据库操作示例
Python爬取数据并写入MySQL数据库的实例首先我们来爬取 http://html-color-codes.info/color-names/ 的一些数据. 按 F12 或 ctrl+u 审查元 ...
Python 爬取北京二手房数据，分析北漂族买得起房吗？（附完整源码）
来源:CSDN 本文约3500字,建议阅读9分钟. 本文根据Python爬取了赶集网北京二手房数据,R对爬取的二手房房价做线性回归分析,适合刚刚接触Python&R的同学们学习参考. 房价高是 ...
python爬取天气_python3爬取各类天气信息
本来是想从网上找找有没有现成的爬取空气质量状况和天气情况的爬虫程序,结果找了一会儿感觉还是自己写一个吧. 主要是爬取北京包括北京周边省会城市的空气质量数据和天气数据. 过程中出现了一个错误:Unico ...
html如何获取请求头变量的值。_如何使用 Python 爬取微信公众号文章
我比较喜欢看公众号,有时遇到一个感兴趣的公众号时,都会感觉相逢恨晚,想一口气看完所有历史文章.但是微信的阅读体验挺不好的,看历史文章得一页页的往后翻,下一次再看时还得重复操作,很是麻烦. 于是便想着能 ...
python爬取网页书籍名称代码_python爬取亚马逊书籍信息代码分享
我有个需求就是抓取一些简单的书籍信息存储到mysql数据库,例如,封面图片,书名,类型,作者,简历,出版社,语种. 我比较之后,决定在亚马逊来实现我的需求. 我分析网站后发现,亚马逊有个高级搜索的功能 ...
python爬取资料_Python爬取FLASH播放器中的资料
Python爬取FLASH播放器中的资料. 一.首先了解一下AMF协议:AMF(Action Message Format)是Flash与服务端通信的一种常见的二进制编码模式,其传输效率高,可以在HT ...

Python 爬取zw年鉴

Python 爬取zw年鉴

爬虫主方法

持久化运行,监测爬虫程序运行状态，如果停止了自动重新开始，直接调用cmd，用绝对路径运行爬虫，并将日志输出获取。

Python 爬取zw年鉴相关推荐

最新文章

热门文章