Webdriver 爬取新浪滚动新闻

初始想法

本人现在是国际关系学院2016级的本科生，学的是信息管理与信息系统。讲道理不知道这个专业到底是干啥的，现在选择的后续方向是数据科学与工程，并且加入了老师的自然语言处理小组。爬虫是做自然语言处理的基础嘛，学习机器学习之前先学学怎么爬取内容还是挺有意义的。本来开始想着爬一下新浪微博的内容，但是又涉及到滚动爬取、账号登录之类的繁琐问题，还是先玩玩滚动新闻吧。其实讲道理中国新闻网的滚动新闻做得比新浪的好多了，界面也好看，不过这都是爬完之后才发现的哈哈哈哈哈哈哈

背景介绍

本项目为基于新浪滚动新闻（https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1）进行网页爬取，以新闻标题为文件名称，以新闻内容为文件正文存储。

内容介绍

环境要求

环境要求：python3.7+
安装包要求：requests，BeautifulSoup（bs4），selenium，jieba 等（time、random 等属于 Python 标准库，无需另外安装）

文件介绍

main：调取其他所有文件相关函数，输入初始url，并计算爬取全部网页耗时
date_helper：对网页日期进行调整实现自动翻页
data_helper：数据读写的辅助函数（读取文本文件、pickle 存取、jieba 分词等）
spider：爬取网页的主文件，调用Webdriver获取主索引页的子页面并获取网页内容
article_spider：爬取新闻正文内容

代码

main

from date_helper import date_processing
from data_helper import pickle_writer
from spider import *
import time

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start = time.perf_counter()

if __name__ == '__main__':
    # URL template of the rolling-news index. "{}" receives the date; the
    # page number is appended after "page=" by the spider.
    url_org = 'http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&date={}&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page='
    try:
        while True:
            date = date_processing()      # next (earlier) date to crawl
            output_list = []              # collects one record per article
            url = url_org.format(date)    # index URL for that date
            sina(url, output_list, date)  # run the crawler
            print(output_list)
            print(len(output_list))
            # The original template was an empty string, so format() always
            # produced '' and the dump could not be opened. Name the dump
            # after the crawled date instead.
            file_name = '{}.pkl'.format(date)
            pickle_writer(output_list, file_name)  # persist the day's records
    except KeyboardInterrupt:
        # The loop has no exit condition of its own; stopping it with Ctrl-C
        # should still reach the timing report below.
        pass

end = time.perf_counter()
print('Running:%s seconds.'%(end - start))

date_helper

from selenium import webdriver
import re
import time
import calendar
import re
import codecs
def count_days(year, month):
    """Return the number of days in the given month.

    :param year: four-digit year as int
    :param month: month number (1-12) as int
    :return: day count as a string, e.g. ``'31'``
    """
    # monthrange() returns (weekday_of_first_day, number_of_days); take the
    # second item directly instead of regex-parsing the tuple's repr.
    return str(calendar.monthrange(year, month)[1])


def month_sub(year, month):
    """Step one month back.

    :param year: year as int
    :param month: month number as int
    :return: ``(year, month)``; the month is a zero-padded two-character
        string, except when wrapping to December of the previous year,
        where it is the int ``12``
    """
    if month > 1:
        return year, '{:02d}'.format(month - 1)
    return year - 1, 12


def date_sub(year, month, day):
    """Return the day before the given date as ``YYYY-MM-DD``.

    :param year: year (int or numeric string)
    :param month: month (int or numeric string, padded or not)
    :param day: day of month (int or numeric string)
    :return: the previous day's date, month and day always zero-padded
    """
    year, month, day = int(year), int(month), int(day)
    if day > 1:
        day -= 1
    else:
        # First day of a month: fall back to the last day of the previous one.
        year, month = month_sub(year, month)
        day = count_days(year, int(month))
    # Format from ints so the result is padded no matter how the caller
    # passed the month (the Sina rolling-news URL expects YYYY-MM-DD).
    return '{}-{:02d}-{:02d}'.format(year, int(month), int(day))


def date_processing():
    """Read the last crawled date, step back one day, store and return it.

    The first line of the state file ``date_txt`` must hold a date in
    ``YYYY-MM-DD`` form; the file is overwritten with the new date.

    :return: the new date string (one day earlier than the stored one)
    """
    # Imported here rather than with a module-level wildcard import:
    # txt_load is the only data_helper name this module uses.
    from data_helper import txt_load

    # NOTE: the path is blank in the published code; it has to point at the
    # text file that stores the last crawled date before this can run.
    date_txt = ""
    last_date = txt_load(date_txt)
    parts = str(last_date[0]).strip().split("-")
    date = date_sub(int(parts[0]), parts[1], int(parts[2]))
    # A context manager closes the file; the original only flushed it and
    # leaked one handle per call.
    with codecs.open(date_txt, 'w', 'UTF-8') as writer:
        writer.write(date)
    return date

data_helper

import re
import pickle
import codecs
import jieba
# Helpers for reading raw data and persisting intermediate results.


def txt_load(path):
    """Read a UTF-8 text file and return its lines.

    :param path: path of the text file
    :return: list of lines, line endings included
    """
    # The context manager closes the reader; the original left it open.
    with codecs.open(path, 'r', 'UTF-8') as reader:
        return reader.readlines()


def join_list(ss):
    """Concatenate an iterable of strings into a single string.

    :param ss: iterable of str
    :return: the concatenation, ``""`` for an empty iterable
    """
    return "".join(ss)


def pickle_writer(input_, name):
    """Pickle an object to a file.

    :param input_: the data to save
    :param name: destination path
    """
    # "with" closes the file even when pickle.dump raises.
    with open(name, "wb") as writer:
        pickle.dump(input_, writer)
    print("finish to write data")


# Reader for the pickled (.plk) data files.
def pickle_load(input_):
    """Load a pickled object from a file.

    :param input_: path of the pickle file
    :return: the unpickled data
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files this
    # project wrote itself, never files from an untrusted source.
    # "with" closes the file even when pickle.load raises.
    with open(input_, "rb") as reader:
        content = pickle.load(reader)
    print("finish to read data")
    return content


def jieba_cut(content):
    """Segment a sentence with jieba.

    :param content: str, the sentence to segment
    :return: list of tokens, with single-space tokens removed
    """
    return [token for token in jieba.cut(content) if token != " "]


def is_chinese(uchar):
    """Filter a character: keep Chinese, lowercase ASCII letters, drop the rest.

    Despite the name this does not return a bool.

    :param uchar: the string (normally one character) to check
    :return: ``uchar`` itself if it lies in U+4E00..U+9FA5, its lowercase
        form if it consists only of ASCII letters, otherwise ``''``
    """
    if u'\u4e00' <= uchar <= u'\u9fa5':
        return uchar
    # Stripping every non-letter changes nothing only for all-letter input.
    if uchar == re.sub('[^a-zA-Z]', '', uchar):
        return str(uchar).lower()
    return ''

spider

# -*- coding: utf-8 -*-
# The selenium import needs its own line: fused onto the coding comment it
# is part of the comment and never executes.
from selenium import webdriver
from article_spider import *
import re
import time

# Locator strategy passed to find_element(s); this is the value of
# selenium's By.XPATH. The find_element_by_xpath helpers used originally
# were removed in Selenium 4.3, while find_element(by, value) exists in
# both old and new releases.
_XPATH = 'xpath'

# Characters that cannot appear in a file name on common file systems.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_filename(title):
    """Replace characters that are illegal in file names with ``_``."""
    return _ILLEGAL_FILENAME_CHARS.sub('_', title)


def get_pages(driver, url):
    """Return the number of index pages available for one date.

    :param driver: the WebDriver instance
    :param url: index URL of the date, ending in ``page=``
    :return page_num: page count as a numeric string (``'1'`` when the page
        shows no pagination links)
    """
    start_url = url + '1'
    driver.get(start_url)
    time.sleep(2)
    driver.refresh()
    time.sleep(2)
    page_html = driver.page_source
    # The unescaped parentheses form a capture group, so each match is the
    # text between "goTo" and ";return false", e.g. "(12)".
    pagelist = re.findall('onclick="newsList.page.goTo(.*?);return false', page_html, re.S)
    if not pagelist:
        # A single page of results has no pagination links; the original
        # raised IndexError here.
        return '1'
    numbers = re.findall(r'\d+', pagelist[-1])
    return numbers[0] if numbers else '1'


def Get_content(driver, page_num, url, output_list, date):
    """Crawl every index page of one date and store each article.

    Each article is written to ``<title>.txt`` and appended to
    ``output_list`` as ``[classfy_cn, classfy_en, date, pubtime, title,
    href, content]``.

    :param driver: the WebDriver instance
    :param page_num: number of index pages for the date
    :param url: index URL of the date, ending in ``page=``
    :param output_list: list that receives the records (mutated in place)
    :param date: the date being crawled
    """
    for k in range(1, int(page_num) + 1):
        driver.get(url + str(k))
        time.sleep(2.5)
        driver.refresh()
        # Take whatever entries the page actually has. The original indexed
        # a fixed 10 x 5 grid, which raised on a short last page and could
        # not see entries beyond the 50th.
        items = driver.find_elements(_XPATH, '//*[@id="d_list"]/ul/li')
        for item in items:
            classfy_cn = item.find_element(_XPATH, './span[1]').text
            link = item.find_element(_XPATH, './span[2]/a')
            title = link.text
            href = link.get_attribute('href')
            times = item.find_element(_XPATH, './span[3]').text
            if not href:
                continue  # nothing to fetch for this entry
            # The clock time is the part after the first space; fall back
            # to the whole text when there is no space.
            time_parts = times.split(" ")
            pubtime = time_parts[1] if len(time_parts) > 1 else times
            article = get_article(href)
            if article is None:
                # The fetch failed; skip the entry instead of crashing on
                # the tuple unpacking below.
                continue
            content, classfy_en = article
            content_list = [classfy_cn, classfy_en, date, pubtime, title, href, content]
            # A title containing "/" (or other reserved characters) would be
            # read as a directory path and make open() fail.
            test = '' + _safe_filename(title) + '.txt'
            # Explicit UTF-8 so Chinese text does not depend on the
            # platform's default encoding.
            with open(test, 'w', encoding='utf-8') as f:
                for content_list_element in content_list:
                    f.write(content_list_element)
            output_list.append(content_list)
        print(len(output_list))


def sina(url, output_list, date):
    """Crawl all rolling-news articles of one date.

    :param url: index URL to crawl, ending in ``page=``
    :param output_list: list that receives the records
    :param date: the date being crawled
    :return: None
    """
    driver = webdriver.Chrome()
    try:
        page_num = get_pages(driver, url)
        Get_content(driver, page_num, url, output_list, date)
    finally:
        # quit() ends the browser and its driver process, also when the
        # crawl raises. The caller starts a new driver for every date, so
        # each one must be released.
        driver.quit()

article_spider

#-*- coding:utf-8 -*-
# The bs4 import needs its own line: fused onto the coding comment it is
# part of the comment and never executes.
from bs4 import BeautifulSoup
from user_agents import agents
import requests
import time
import random


def get_article(url):
    """Download one article page and extract its text.

    :param url: URL of the article
    :return content: the article text (``''`` when the download or the
        parsing fails)
    :return classfy: the article category, taken from the first label of
        the host name (``''`` when the URL cannot be parsed)
    """
    content, classfy = '', ''
    try:
        url = url.rsplit('\r\n')[0]  # drop a trailing CRLF, if any
        classfy = url.split('.')[0].split('//')[1]
        header = {'User-Agent': random.choice(agents)}
        # Without a timeout a stalled connection blocks the crawl forever.
        res = requests.get(url, headers=header, timeout=10)
        time.sleep(1)  # pause between article requests
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        content = ''.join(getnewsArticle(soup.select('.art_p')))
    except Exception as e:
        # Best effort: one bad article must not stop the crawl. Report it
        # and return the (content, classfy) pair callers unpack; the
        # original fell through and returned None.
        print(e)
    return content, classfy


def getnewsArticle(news):
    """Collect the stripped text of each paragraph element.

    :param news: iterable of elements exposing a ``text`` attribute
    :return newsArticle: list with the stripped text of every element
    """
    return [p.text.strip() for p in news]

后记

爬取下来的内容还算ok，虽然这样看起来又繁琐又蛋疼，而且好像如果新闻标题中含有" / "这个字符的话，就会报错，显示没有这个文件夹（因为标题被直接用作文件名，而"/"会被当成路径分隔符），或许第二版代码我再考虑怎么加一下中文匹配（？）。代码里面的agent就随便网上找点就行，不用太在意。
第一次写，想来也有很多很多毛病，如果有人看到，还请指出，感恩的心，感谢有你。