A quick memo on scraping Sina news
I. A simple approach to scraping Sina news
1. requests for HTTP fetching
2. pyquery for parsing
3. a custom logging module, logger.py
The script below compares three ways of fetching the collected article pages: a plain loop, Python's multiprocessing, and gevent coroutines.
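Before the full script, here is a stand-alone sketch of that comparison — my own illustration, not part of the original project, with a hypothetical fetch_one standing in for the project's get_page_detail():

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Hypothetical micro-benchmark: sequential loop vs. gevent pool.
import gevent.monkey
gevent.monkey.patch_all()   # must run before requests is imported

import time
import requests
import gevent.pool

URLS = ['http://finance.sina.com.cn/forex/'] * 10   # any list of pages works

def fetch_one(url):
    # stand-in for the project's get_page_detail()
    return requests.get(url).status_code

s = time.time()
for u in URLS:                              # plain loop: one request at a time
    fetch_one(u)
print('plain loop: %.2fs' % (time.time() - s))

s = time.time()
gevent.pool.Pool(5).map(fetch_one, URLS)    # 5 concurrent green threads
print('gevent pool: %.2fs' % (time.time() - s))

The multiprocessing variant replaces the gevent pool with multiprocessing.Pool(5).map; in the full script below all three appear, with only the gevent version left active.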
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import gevent.monkey
gevent.monkey.patch_all()  # patch sockets before requests is imported

import time
from multiprocessing import Pool

import requests
import gevent.pool
from pyquery import PyQuery as pq

from logger import *
from spiderDetail import *

sina_forex_url = 'http://finance.sina.com.cn/forex/'


def get_index_pages():
    response = requests.get(sina_forex_url)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        content = response.text
        # Collect every article URL in a set: the same story can appear in
        # several sections of the page, so the set deduplicates them.
        newsSet = set()

        # Top headline
        hot_ad_link = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > h1 > a')
        hot_ad_url = hot_ad_link.attr('href')
        hot_ad_title = hot_ad_link.text()
        logger.debug('Top headline')
        print(hot_ad_url)
        newsSet.add(hot_ad_url)

        # Focus news
        logger.debug('Focus news')
        for new in get_focus_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Rolling news
        logger.debug('Rolling news')
        for new in get_roll_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # 24-hour news express
        logger.debug('24-hour news')
        for new in get_24hours_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Analysis and data
        logger.debug('Analysis and data')
        for new in get_analysis_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Institutional opinion
        logger.debug('Institutional opinion')
        for new in get_institution_opinion_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Expert opinion
        logger.debug('Expert opinion')
        for new in get_specialist_opinion_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # RMB exchange rate
        logger.debug('RMB exchange rate')
        for new in get_RMB_exchange_news(content):
            newsSet.add(new['url'])
            logger.debug(new)

        # Fetch the full text of every collected article
        logger.debug('Total articles collected: ' + str(len(newsSet)))

        # Option 1: plain loop
        # for url in newsSet:
        #     get_page_detail(url)

        # Option 2: multiprocessing pool
        # pool = Pool(5)
        # pool.map(get_page_detail, newsSet)
        # pool.close()
        # pool.join()

        # Option 3: gevent coroutine pool (the one actually used)
        pool = gevent.pool.Pool(5)
        pool.map(get_page_detail, newsSet)
        return len(newsSet)
    else:
        logger.info('Failed to fetch the Sina forex front page')


# Focus news
def get_focus_news(content):
    focus_news_list = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > div.ListB > ul > li')
    for li in focus_news_list.items():
        yield {'title': li.text(), 'url': li('a').attr('href')}


# Rolling news
def get_roll_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(25) > div.Center > div.ListB > ul > li')
    for li in roll_news_list.items():
        yield {'title': li.text(), 'url': li('a').attr('href')}


# 24-hour news express
def get_24hours_news(content):
    roll_news_list = pq(content)('#wrap > div.PartA.Top10 > div.CenterB > div.ListB.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Analysis and data news
def get_analysis_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(28) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Institutional opinion news
def get_institution_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(29) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# Expert opinion news
def get_specialist_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(30) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}


# RMB exchange rate news
def get_RMB_exchange_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(31) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {'title': li('a').text() + li('span').text(), 'url': li('a').attr('href')}

# get_index_pages()
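All the get_*_news helpers share one pattern: run a CSS selector over the page and yield a {'title', 'url'} dict per <li>. A self-contained illustration on made-up HTML (not Sina's real markup):

from pyquery import PyQuery as pq

html = """
<div class="ListB"><ul>
  <li><a href="http://example.com/a1">headline one</a><span>07:30</span></li>
  <li><a href="http://example.com/a2">headline two</a><span>08:15</span></li>
</ul></div>
"""

def get_news(content):
    # same shape as the helpers above: selector -> iterate -> yield dicts
    for li in pq(content)('div.ListB > ul > li').items():
        yield {'title': li('a').text() + li('span').text(),
               'url': li('a').attr('href')}

for item in get_news(html):
    print(item)
# {'title': 'headline one07:30', 'url': 'http://example.com/a1'} ...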
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
from hashlib import md5

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup as bs

from logger import *
from DBHelper import *


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            content = response.text
            pqContent = pq(content)
            title = pqContent('#artibodyTitle').text()
            print(title)
            date = pqContent('#wrapOuter > div > div.page-info > span').text()[:16]
            print(date)
            cnt = bs(content, "lxml")
            body = cnt.find(id='artibody')
            if body:
                blockquote = body.find('blockquote')
                if blockquote:
                    # Swap the source's credit blockquote for a link back to
                    # your own site. Build a real <a> tag: raw HTML assigned to
                    # .string would be escaped by BeautifulSoup on output.
                    new_tag = cnt.new_tag('a', href='http://www.mysite.com')
                    new_tag.string = 'your site name here'
                    blockquote.replace_with(new_tag)
                # Extract the article text between Sina's boundary comments
                # (the Chinese markers below are part of the page source).
                regex = re.compile('<!-- 原始正文start -->(.*)<!-- 原始正文end -->', re.S)
                match = re.findall(regex, str(body))
                if match:
                    match = match[0].strip()
                    # Download each illustration and point its src at the local copy.
                    for img in pq(match)('img'):
                        src = pq(img).attr('src')
                        img_name = get_page_img(src)
                        if img_name:
                            # re.escape(): the URL is used as a regex pattern,
                            # so its dots must not act as metacharacters
                            match = re.subn(re.escape(src), 'img/' + img_name, match)[0]
                    content_url = write_to_file(match, url)
                    article = {'title': title, 'content': content_url, 'date': date, 'expired': 'false'}
                    insert(article)
                else:
                    logger.info('Could not extract the article body: [%s]' % url)
            else:
                logger.info('No element with id "artibody" on the page: [%s]' % url)
    except RequestException:
        logger.info('Error requesting the article page: %s' % url)


# Fetch one image referenced in the article body
def get_page_img(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return save_image(response.content)
        return None
    except RequestException:
        logger.info('Error requesting image: %s' % url)
        return None


# Save an illustration, named by the md5 of its bytes
def save_image(content):
    img_folder = os.path.join(os.getcwd(), 'img')
    os.makedirs(img_folder, exist_ok=True)
    img_name = md5(content).hexdigest()
    img_path = '{0}/{1}.{2}'.format(img_folder, img_name, 'jpg')
    if not os.path.exists(img_path):
        with open(img_path, 'wb') as f:
            f.write(content)
    return img_name + '.jpg'


# Save the article body to a file named by the md5 of its URL
def write_to_file(content, url):
    content_folder = os.path.join(os.getcwd(), 'files')
    os.makedirs(content_folder, exist_ok=True)
    file_name = md5(url.encode('utf-8')).hexdigest()
    file_path = '{0}/{1}.{2}'.format(content_folder, file_name, 'txt')
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False))
        logger.info('Article body saved --- Sina url: ' + url)
    return file_name
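The blockquote replacement in get_page_detail is easy to verify in isolation; a small sketch on made-up HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="artibody"><p>text</p>'
                     '<blockquote>source credit</blockquote></div>', 'lxml')
body = soup.find(id='artibody')
blockquote = body.find('blockquote')
if blockquote:
    link = soup.new_tag('a', href='http://www.mysite.com')  # hypothetical URL
    link.string = 'my site'
    blockquote.replace_with(link)
print(body)
# <div id="artibody"><p>text</p><a href="http://www.mysite.com">my site</a></div>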
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import logging
import time

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# One log file per day
os.makedirs('./log', exist_ok=True)  # FileHandler cannot create the directory itself
logFile = './log/log_{0}.txt'.format(time.strftime("%Y%m%d", time.localtime()))

fh = logging.FileHandler(logFile, mode='a')
fh.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s:%(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)
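Because logger.py configures the root logger at import time, every other module just does from logger import * and logs straight away:

from logger import *

logger.debug('console only: the file handler is set to INFO and above')
logger.info('goes to the console and to ./log/log_YYYYMMDD.txt')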
Other supporting modules: Models.py, DBHelper.py, and config.py.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pymysql

from config import *
from logger import *


def insert(article):
    db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                         db=DATABASE, charset='utf8', use_unicode=True)
    cursor = db.cursor()
    # Parameterized query: letting pymysql do the quoting avoids SQL injection
    # and broken statements when a title contains a quote character.
    sql = """insert into articles(title,content,date,expired) values(%s,%s,%s,%s)"""
    try:
        cursor.execute(sql, (article['title'], article['content'], article['date'], article['expired']))
        db.commit()
        logger.info('Article row inserted: ' + article['title'])
    except Exception as e:
        logger.error('Failed to insert article row: %s' % e)
        db.rollback()
    db.close()
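insert() assumes an articles table already exists. The original post never shows the schema, so the DDL below is only a plausible reconstruction (column types and sizes are my guesses):

CREATE TABLE IF NOT EXISTS articles (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    title   VARCHAR(255),
    content VARCHAR(64),   -- md5 file name returned by write_to_file
    date    VARCHAR(32),   -- stored as text, e.g. '2018-01-09 10:30'
    expired VARCHAR(8)     -- 'true' / 'false'
) DEFAULT CHARSET = utf8;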
# database config items
HOST = '127.0.0.1'
PORT = 6000  # MySQL port; I changed mine to 6000
DATABASE = 'your_database_name'
USERNAME = 'your_database_user'
PASSWORD = 'your_database_password'
#!/usr/bin/python3
# -*- coding: utf-8 -*-

class Article:
    # Class-level defaults for an article record. No trailing commas here:
    # a trailing comma would turn each default into a one-element tuple.
    ID = None
    Title = ''
    SubTitle = ''
    Summary = ''
    Content = ''
    Date = ''
    Author = ''
    ForumID = 0
    StickyPost = 'false'
    Expired = 'false'
II. Configuring nginx
The request entry point is app.py, served with nginx + uWSGI.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time

from sinaForex import *
from logger import *


def application(env, start_response):
    start_response('200 OK', [('Content-Type', 'text/html')])
    s = time.time()
    count = get_index_pages()   # "count" rather than "len", to keep the builtin usable
    e = time.time()
    logger.info("----------------- crawled {0} articles, elapsed: {1}s -----------------".format(count, round(e - s, 3)))
    rst = "crawled {0} articles, elapsed: {1}s".format(count, round(e - s, 3))
    print(time.localtime(time.time()))
    return [rst.encode('utf-8')]
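Before wiring up nginx and uWSGI, you can sanity-check application() with the standard library's wsgiref server — a throwaway smoke test, not part of the deployment:

# test_app.py -- hypothetical local smoke test
from wsgiref.simple_server import make_server
from app import application

httpd = make_server('', 8001, application)
print('Serving on http://127.0.0.1:8001 ...')
httpd.serve_forever()

In production, nginx takes that place and proxies /spider to uWSGI, as configured below: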
server {
listen 80;
root /www/web/sina_forex;
server_name py.mysite.com;
index index.html index.php index.htm;
error_page 400 /errpage/400.html;
error_page 403 /errpage/403.html;
error_page 404 /errpage/404.html;
error_page 503 /errpage/503.html;
location /spider {
uwsgi_pass 127.0.0.1:8001;
include uwsgi_params;
}
location / {
try_files $uri @apache;
}
}
And the matching uWSGI configuration, uwsgi8001.ini:

[uwsgi]
# service port that nginx's uwsgi_pass points at
socket = :8001
# site root
chdir = /www/web/sina_forex
# the WSGI entry module
wsgi-file = app.py
vhost = true
master = true
processes = 5
pidfile = /www/web/sina_forex/uwsgi8001.pid
daemonize = /www/web/sina_forex/log/uwsgi8001.log
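Assuming the block above is saved as /www/web/sina_forex/uwsgi8001.ini, the service is controlled like this:

uwsgi --ini /www/web/sina_forex/uwsgi8001.ini     # start (daemonizes per the config)
uwsgi --reload /www/web/sina_forex/uwsgi8001.pid  # graceful reload via the pidfile
uwsgi --stop /www/web/sina_forex/uwsgi8001.pid    # stop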
With this in place the service is reachable over the web; under the nginx location rule above, the spider is triggered at http://py.mysite.com/spider.
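A quick check from the command line (the figures in the sample output are illustrative only):

$ curl http://py.mysite.com/spider
crawled 87 articles, elapsed: 36.512s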
III. Collecting news on a schedule
Write a small script, autoSpiderTimer.py, that fires the web request on a schedule; the process itself is kept alive by supervisor (a config sketch follows the script below).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import time


def timer():
    # Run forever, collecting once every 24 hours; without the loop the
    # script would fetch a single time and then exit after the sleep.
    while True:
        response = requests.get('http://py.mysite.com/spider')
        print(response.text)
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        time.sleep(3600 * 24)


timer()  # collect once a day
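The post leaves the supervisor side unspecified; a minimal program block might look like this (all paths are assumptions, adjust to your layout):

[program:autoSpiderTimer]
command = python3 /www/web/sina_forex/autoSpiderTimer.py
directory = /www/web/sina_forex
autostart = true
autorestart = true
stdout_logfile = /www/web/sina_forex/log/timer_out.log
stderr_logfile = /www/web/sina_forex/log/timer_err.log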
Reposted from: https://www.cnblogs.com/ameile/p/8250301.html