A quick memo on scraping Sina news
I. A simple approach to scraping Sina news
1. requests for HTTP fetching
2. pyquery for parsing
3. a custom logging module, logger.py
The script below compares three ways of fetching the collected article pages: a plain loop, Python's multiprocessing, and gevent coroutines.
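Before the full script, here is a stand-alone sketch of that comparison — my own illustration, not part of the original project, with a hypothetical fetch_one standing in for the project's get_page_detail():

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Hypothetical micro-benchmark: sequential loop vs. gevent pool.
import gevent.monkey
gevent.monkey.patch_all()   # must run before requests is imported

import time
import requests
import gevent.pool

URLS = ['http://finance.sina.com.cn/forex/'] * 10   # any list of pages works

def fetch_one(url):
    # stand-in for the project's get_page_detail()
    return requests.get(url).status_code

s = time.time()
for u in URLS:                              # plain loop: one request at a time
    fetch_one(u)
print('plain loop: %.2fs' % (time.time() - s))

s = time.time()
gevent.pool.Pool(5).map(fetch_one, URLS)    # 5 concurrent green threads
print('gevent pool: %.2fs' % (time.time() - s))

The multiprocessing variant replaces the gevent pool with multiprocessing.Pool(5).map; in the full script below all three appear, with only the gevent version left active.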
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import gevent.monkey
gevent.monkey.patch_all()  # patch sockets before requests is imported

import time
from multiprocessing import Pool

import requests
import gevent.pool
from pyquery import PyQuery as pq

from logger import *
from spiderDetail import *

sina_forex_url = 'http://finance.sina.com.cn/forex/'


def get_index_pages():
    response = requests.get(sina_forex_url)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        content = response.text
        # Collect every article URL in a set: the same story can appear in
        # several sections of the page, so the set deduplicates them.
        newsSet = set()

        # Top headline
        hot_ad_link = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > h1 > a')
        hot_ad_url = hot_ad_link.attr('href')
        hot_ad_title = hot_ad_link.text()
        logger.debug('Top headline')
        print(hot_ad_url)
        newsSet.add(hot_ad_url)

        # Focus news
        logger.debug('Focus news')
        for new in get_focus_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Rolling news
        logger.debug('Rolling news')
        for new in get_roll_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # 24-hour news express
        logger.debug('24-hour news')
        for new in get_24hours_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Analysis and data
        logger.debug('Analysis and data')
        for new in get_analysis_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Institutional opinion
        logger.debug('Institutional opinion')
        for new in get_institution_opinion_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Expert opinion
        logger.debug('Expert opinion')
        for new in get_specialist_opinion_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # RMB exchange rate
        logger.debug('RMB exchange rate')
        for new in get_RMB_exchange_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Fetch the full text of every collected article
        logger.debug('Total articles collected: ' + str(len(newsSet)))

        # Option 1: plain loop
        # for url in newsSet:
        #     get_page_detail(url)

        # Option 2: multiprocessing pool
        # pool = Pool(5)
        # pool.map(get_page_detail, newsSet)
        # pool.close()
        # pool.join()

        # Option 3: gevent coroutine pool (the one actually used)
        pool = gevent.pool.Pool(5)
        pool.map(get_page_detail, newsSet)
        return len(newsSet)
    else:
        logger.info('Failed to fetch the Sina forex front page')


# Focus news
def get_focus_news(content):
    focus_news_list = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > div.ListB > ul > li')
    for li in focus_news_list.items():
        yield {'title': li.text(), 'url': li('a').attr('href')}


# Rolling news
def get_roll_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(25) > div.Center > div.ListB > ul > li')
    for li in roll_news_list.items():
        yield {'title': li.text(), 'url': li('a').attr('href')}


# 24-hour news express
def get_24hours_news(content):
    roll_news_list = pq(content)('#wrap > div.PartA.Top10 > div.CenterB > div.ListB.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Analysis and data news
def get_analysis_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(28) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Institutional opinion news
def get_institution_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(29) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Expert opinion news
def get_specialist_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(30) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# RMB exchange rate news
def get_RMB_exchange_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(31) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}

# get_index_pages()
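All the get_*_news helpers share one pattern: run a CSS selector over the page and yield a {'title', 'url'} dict per <li>. A self-contained illustration on made-up HTML (not Sina's real markup):

from pyquery import PyQuery as pq

html = """
<div class="ListB"><ul>
  <li><a href="http://example.com/a1">headline one</a><span>07:30</span></li>
  <li><a href="http://example.com/a2">headline two</a><span>08:15</span></li>
</ul></div>
"""

def get_news(content):
    # same shape as the helpers above: selector -> iterate -> yield dicts
    for li in pq(content)('div.ListB > ul > li').items():
        yield {'title': li('a').text() + li('span').text(),
               'url': li('a').attr('href')}

for item in get_news(html):
    print(item)
# {'title': 'headline one07:30', 'url': 'http://example.com/a1'} ...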
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
from hashlib import md5

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup as bs

from logger import *
from DBHelper import *


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            content = response.text
            pqContent = pq(content)
            title = pqContent('#artibodyTitle').text()
            print(title)
            date = pqContent('#wrapOuter > div > div.page-info > span').text()[:16]
            print(date)
            cnt = bs(content, "lxml")
            body = cnt.find(id='artibody')
            if body:
                blockquote = body.find('blockquote')
                if blockquote:
                    # Swap the source's credit blockquote for a link back to
                    # your own site. Build a real <a> tag: raw HTML assigned to
                    # .string would be escaped by BeautifulSoup on output.
                    new_tag = cnt.new_tag('a', href='http://www.mysite.com')
                    new_tag.string = 'your site name here'
                    blockquote.replace_with(new_tag)
                # Extract the article text between Sina's boundary comments
                # (the Chinese markers below are part of the page source).
                regex = re.compile('<!-- 原始正文start -->(.*)<!-- 原始正文end -->', re.S)
                match = re.findall(regex, str(body))
                if match:
                    match = match[0].strip()
                    # Download each illustration and point its src at the local copy.
                    for img in pq(match)('img'):
                        src = pq(img).attr('src')
                        img_name = get_page_img(src)
                        if img_name:
                            # re.escape(): the URL is used as a regex pattern,
                            # so its dots must not act as metacharacters
                            match = re.subn(re.escape(src), 'img/' + img_name, match)[0]
                    content_url = write_to_file(match, url)
                    article = {'title': title, 'content': content_url, 'date': date, 'expired': 'false'}
                    insert(article)
                else:
                    logger.info('Could not extract the article body: [%s]' % url)
            else:
                logger.info('No element with id "artibody" on the page: [%s]' % url)
    except RequestException:
        logger.info('Error requesting the article page: %s' % url)


# Fetch one image referenced in the article body
def get_page_img(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return save_image(response.content)
        return None
    except RequestException:
        logger.info('Error requesting image: %s' % url)
        return None


# Save an illustration, named by the md5 of its bytes
def save_image(content):
    img_folder = os.path.join(os.getcwd(), 'img')
    os.makedirs(img_folder, exist_ok=True)
    img_name = md5(content).hexdigest()
    img_path = '{0}/{1}.{2}'.format(img_folder, img_name, 'jpg')
    if not os.path.exists(img_path):
        with open(img_path, 'wb') as f:
            f.write(content)
    return img_name + '.jpg'


# Save the article body to a file named by the md5 of its URL
def write_to_file(content, url):
    content_folder = os.path.join(os.getcwd(), 'files')
    os.makedirs(content_folder, exist_ok=True)
    file_name = md5(url.encode('utf-8')).hexdigest()
    file_path = '{0}/{1}.{2}'.format(content_folder, file_name, 'txt')
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False))
        logger.info('Article body saved --- Sina url: ' + url)
    return file_name
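The blockquote replacement in get_page_detail is easy to verify in isolation; a small sketch on made-up HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="artibody"><p>text</p>'
                     '<blockquote>source credit</blockquote></div>', 'lxml')
body = soup.find(id='artibody')
blockquote = body.find('blockquote')
if blockquote:
    link = soup.new_tag('a', href='http://www.mysite.com')  # hypothetical URL
    link.string = 'my site'
    blockquote.replace_with(link)
print(body)
# <div id="artibody"><p>text</p><a href="http://www.mysite.com">my site</a></div>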
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import logging
import time

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# One log file per day
os.makedirs('./log', exist_ok=True)  # FileHandler cannot create the directory itself
logFile = './log/log_{0}.txt'.format(time.strftime("%Y%m%d", time.localtime()))

fh = logging.FileHandler(logFile, mode='a')
fh.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s:%(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)
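Because logger.py configures the root logger at import time, every other module just does from logger import * and logs straight away:

from logger import *

logger.debug('console only: the file handler is set to INFO and above')
logger.info('goes to the console and to ./log/log_YYYYMMDD.txt')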
Other supporting modules: Models.py, DBHelper.py, and config.py.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pymysql

from config import *
from logger import *


def insert(article):
    db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                         db=DATABASE, charset='utf8', use_unicode=True)
    cursor = db.cursor()
    # Parameterized query: letting pymysql do the quoting avoids SQL injection
    # and broken statements when a title contains a quote character.
    sql = """insert into articles(title,content,date,expired) values(%s,%s,%s,%s)"""
    try:
        cursor.execute(sql, (article['title'], article['content'], article['date'], article['expired']))
        db.commit()
        logger.info('Article row inserted: ' + article['title'])
    except Exception as e:
        logger.error('Failed to insert article row: %s' % e)
        db.rollback()
    db.close()
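insert() assumes an articles table already exists. The original post never shows the schema, so the DDL below is only a plausible reconstruction (column types and sizes are my guesses):

CREATE TABLE IF NOT EXISTS articles (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    title   VARCHAR(255),
    content VARCHAR(64),   -- md5 file name returned by write_to_file
    date    VARCHAR(32),   -- stored as text, e.g. '2018-01-09 10:30'
    expired VARCHAR(8)     -- 'true' / 'false'
) DEFAULT CHARSET = utf8;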
# database config items
HOST = '127.0.0.1'
PORT = 6000  # MySQL port; I changed mine to 6000
DATABASE = 'your_database_name'
USERNAME = 'your_database_user'
PASSWORD = 'your_database_password'
#!/usr/bin/python3
# -*- coding: utf-8 -*-

class Article:
    # Class-level defaults for an article record. No trailing commas here:
    # a trailing comma would turn each default into a one-element tuple.
    ID = None
    Title = ''
    SubTitle = ''
    Summary = ''
    Content = ''
    Date = ''
    Author = ''
    ForumID = 0
    StickyPost = 'false'
    Expired = 'false'
II. Configuring nginx
The request entry point is app.py, served with nginx + uWSGI.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time

from sinaForex import *
from logger import *


def application(env, start_response):
    start_response('200 OK', [('Content-Type', 'text/html')])
    s = time.time()
    count = get_index_pages()   # "count" rather than "len", to keep the builtin usable
    e = time.time()
    logger.info("----------------- crawled {0} articles, elapsed: {1}s -----------------".format(count, round(e - s, 3)))
    rst = "crawled {0} articles, elapsed: {1}s".format(count, round(e - s, 3))
    print(time.localtime(time.time()))
    return [rst.encode('utf-8')]
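Before wiring up nginx and uWSGI, you can sanity-check application() with the standard library's wsgiref server — a throwaway smoke test, not part of the deployment:

# test_app.py -- hypothetical local smoke test
from wsgiref.simple_server import make_server
from app import application

httpd = make_server('', 8001, application)
print('Serving on http://127.0.0.1:8001 ...')
httpd.serve_forever()

In production, nginx takes that place and proxies /spider to uWSGI, as configured below: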
server {
listen 80;
root /www/web/sina_forex;
server_name py.mysite.com;
index index.html index.php index.htm;
error_page 400 /errpage/400.html;
error_page 403 /errpage/403.html;
error_page 404 /errpage/404.html;
error_page 503 /errpage/503.html;
location /spider {
uwsgi_pass 127.0.0.1:8001;
include uwsgi_params;
}
location / {
try_files $uri @apache;
}
}
And the matching uWSGI configuration, uwsgi8001.ini:

[uwsgi]
# service port that nginx's uwsgi_pass points at
socket = :8001
# site root
chdir = /www/web/sina_forex
# the WSGI entry module
wsgi-file = app.py
vhost = true
master = true
processes = 5
pidfile = /www/web/sina_forex/uwsgi8001.pid
daemonize = /www/web/sina_forex/log/uwsgi8001.log
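Assuming the block above is saved as /www/web/sina_forex/uwsgi8001.ini, the service is controlled like this:

uwsgi --ini /www/web/sina_forex/uwsgi8001.ini     # start (daemonizes per the config)
uwsgi --reload /www/web/sina_forex/uwsgi8001.pid  # graceful reload via the pidfile
uwsgi --stop /www/web/sina_forex/uwsgi8001.pid    # stop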
With this in place the service is reachable over the web; under the nginx location rule above, the spider is triggered at http://py.mysite.com/spider.
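A quick check from the command line (the figures in the sample output are illustrative only):

$ curl http://py.mysite.com/spider
crawled 87 articles, elapsed: 36.512s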
III. Collecting news on a schedule
Write a small script, autoSpiderTimer.py, that fires the web request on a schedule; the process itself is kept alive by supervisor (a config sketch follows the script below).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import time


def timer():
    # Run forever, collecting once every 24 hours; without the loop the
    # script would fetch a single time and then exit after the sleep.
    while True:
        response = requests.get('http://py.mysite.com/spider')
        print(response.text)
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        time.sleep(3600 * 24)


timer()  # collect once a day
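The post leaves the supervisor side unspecified; a minimal program block might look like this (all paths are assumptions, adjust to your layout):

[program:autoSpiderTimer]
command = python3 /www/web/sina_forex/autoSpiderTimer.py
directory = /www/web/sina_forex
autostart = true
autorestart = true
stdout_logfile = /www/web/sina_forex/log/timer_out.log
stderr_logfile = /www/web/sina_forex/log/timer_err.log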
Reposted from: https://www.cnblogs.com/ameile/p/8250301.html