参考代码

#qiushibaike.py#import urllib
#import re
#import chardetimport requests
from lxml import etreepage = 1
url = 'http://www.qiushibaike.com/8hr/page/' + str(page)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36','Accept-Language': 'zh-CN,zh;q=0.8'}try:response = requests.get(url, headers=headers)resHtml = response.texthtml = etree.HTML(resHtml)result = html.xpath('//div[contains(@id,"qiushi_tag")]')for site in result:item = {}imgUrl = site.xpath('./div/a/img/@src')[0].encode('utf-8')username = site.xpath('./div/a/@title')[0].encode('utf-8')#username = site.xpath('.//h2')[0].textcontent = site.xpath('.//div[@class="content"]/span')[0].text.strip().encode('utf-8')# 投票次数vote = site.xpath('.//i')[0].text#print site.xpath('.//*[@class="number"]')[0].text# 评论信息comments = site.xpath('.//i')[1].textprint imgUrl, username, content, vote, commentsexcept Exception, e:print e

演示效果

多线程糗事百科案例

案例要求参考上面糗事百科单进程案例

Queue（队列对象）

Queue是python中的标准库，可以直接import Queue引用;队列是线程间最常用的交换数据的形式

python下多线程的思考

对于资源，加锁是个重要的环节。因为python原生的list,dict等，都是not thread safe的。而Queue，是线程安全的，因此在满足使用条件下，建议使用队列

初始化： class Queue.Queue(maxsize) FIFO 先进先出
包中的常用方法:
- Queue.qsize() 返回队列的大小
- Queue.empty() 如果队列为空，返回True,反之False
- Queue.full() 如果队列满了，返回True,反之False
- Queue.full 与 maxsize 大小对应
- Queue.get([block[, timeout]])获取队列，timeout等待时间
创建一个“队列”对象
- import Queue
- myqueue = Queue.Queue(maxsize = 10)
将一个值放入队列中
- myqueue.put(10)
将一个值从队列中取出
- myqueue.get()

多线程示意图

# -*- coding:utf-8 -*-
import requests
from lxml import etree
from Queue import Queue
import threading
import time
import jsonclass thread_crawl(threading.Thread):'''抓取线程类'''def __init__(self, threadID, q):threading.Thread.__init__(self)self.threadID = threadIDself.q = qdef run(self):print "Starting " + self.threadIDself.qiushi_spider()print "Exiting ", self.threadIDdef qiushi_spider(self):# page = 1while True:if self.q.empty():breakelse:page = self.q.get()print 'qiushi_spider=', self.threadID, ',page=', str(page)url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/'headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36','Accept-Language': 'zh-CN,zh;q=0.8'}# 多次尝试失败结束、防止死循环timeout = 4while timeout > 0:timeout -= 1try:content = requests.get(url, headers=headers)data_queue.put(content.text)breakexcept Exception, e:print 'qiushi_spider', eif timeout < 0:print 'timeout', urlclass Thread_Parser(threading.Thread):'''页面解析类；'''def __init__(self, threadID, queue, lock, f):threading.Thread.__init__(self)self.threadID = threadIDself.queue = queueself.lock = lockself.f = fdef run(self):print 'starting ', self.threadIDglobal total, exitFlag_Parserwhile not exitFlag_Parser:try:'''调用队列对象的get()方法从队头删除并返回一个项目。可选参数为block，默认为True。如果队列为空且block为True，get()就使调用线程暂停，直至有项目可用。如果队列为空且block为False，队列将引发Empty异常。'''item = self.queue.get(False)if not item:passself.parse_data(item)self.queue.task_done()print 'Thread_Parser=', self.threadID, ',total=', totalexcept:passprint 'Exiting ', self.threadIDdef parse_data(self, item):'''解析网页函数:param item: 网页内容:return:'''global totaltry:html = etree.HTML(item)result = html.xpath('//div[contains(@id,"qiushi_tag")]')for site in result:try:imgUrl = site.xpath('.//img/@src')[0]title = site.xpath('.//h2')[0].textcontent = site.xpath('.//div[@class="content"]/span')[0].text.strip()vote = Nonecomments = Nonetry:vote = site.xpath('.//i')[0].textcomments = site.xpath('.//i')[1].textexcept:passresult = {'imgUrl': imgUrl,'title': title,'content': content,'vote': vote,'comments': comments,}with self.lock:# print 'write %s' % json.dumps(result)self.f.write(json.dumps(result, ensure_ascii=False).encode('utf-8') + "\n")except Exception, e:print 'site in result', eexcept Exception, e:print 'parse_data', ewith self.lock:total += 1data_queue = Queue()
exitFlag_Parser = False
lock = threading.Lock()
total = 0def main():output = open('qiushibaike.json', 'a')#初始化网页页码page从1-10个页面pageQueue = Queue(50)for page in range(1, 11):pageQueue.put(page)#初始化采集线程crawlthreads = []crawlList = ["crawl-1", "crawl-2", "crawl-3"]for threadID in crawlList:thread = thread_crawl(threadID, pageQueue)thread.start()crawlthreads.append(thread)#初始化解析线程parserListparserthreads = []parserList = ["parser-1", "parser-2", "parser-3"]#分别启动parserListfor threadID in parserList:thread = Thread_Parser(threadID, data_queue, lock, output)thread.start()parserthreads.append(thread)# 等待队列清空while not pageQueue.empty():pass# 等待所有线程完成for t in crawlthreads:t.join()while not data_queue.empty():pass# 通知线程是时候退出global exitFlag_ParserexitFlag_Parser = Truefor t in parserthreads:t.join()print "Exiting Main Thread"with lock:output.close()if __name__ == '__main__':main()

Python爬虫实战糗事百科实例相关推荐

python爬虫案例——糗事百科数据采集
全栈工程师开发手册 (作者:栾鹏) python教程全解 python爬虫案例--糗事百科数据采集通过python实现糗事百科页面的内容采集是相对来说比较容易的,因为糗事百科不需要登陆,不需要coo ...
python爬虫之糗事百科
历经1个星期的实践,终于把python爬虫的第一个实践项目完成了,此时此刻,心里有的只能用兴奋来形容,后续将继续加工,把这个做成一个小文件,发给同学,能够在cmd中运行的文件.简化版程序,即单单爬取页 ...
Python爬取糗事百科段子+定时发送QQ邮箱
文章目录前言 1. 库导入及介绍 2. 获取网页源码 3. 提取需要的信息 4. 优化输出数据 5. 发送邮件 6. 实现定时发送 7. 源码前言学习Python爬虫也有段时间了,总想着搞点事做 ...
用Python爬取糗事百科段子，可视化后结果发现
大家好,我是小五???? 生活真是太苦了,需要找点快乐的精神食粮支撑社畜生活,听说糗事百科段子挺多,今天就来看一看! 糗事百科的段子栏目声称:幽默笑话大全__爆笑笑话__笑破你的肚子的搞笑段子,我们用 ...
Python爬取糗事百科段子
Python爬取糗事百科段子 Python2.7.15 今天我们来爬取糗事百科的段子一.获取糗事百科的网页源码首先,打开浏览器,进入糗事百科,复制它的网址. 然后我们翻个页,可以看到,网址变成了这 ...
Android实战——jsoup实现网络爬虫，糗事百科项目的起步
Android实战--jsoup实现网络爬虫,爬糗事百科主界面本篇文章包括以下内容: 前言 jsoup的简介 jsoup的配置 jsoup的使用结语前言对于Android初学者想要做项目时,最 ...
python爬虫练习1：通过python爬取糗事百科的搞笑图片
爬取糗百的图片最近在研究python,想着自学爬虫获取数据,到时候进行数据分析在爬取图片的时候有些曲折,但是最终还是被我攻克下来了话不多说,我们开始我们要爬取的网站是糗事百科的图片首先, ...
JavaScript获取本机浏览器UA助力Python爬取糗事百科首页
问题背景: 使用Python编写爬虫时,经常会遇到反爬机制,例如网站要求必须使用浏览器访问.就像下面的403错误: 或者下面这种错误信息: 一般来说,这是遇到反爬机制了,对方要求使用浏览器访问.这时可 ...
利用Python爬取糗事百科段子信息
文章来源:公众号-智能化IT系统. 爬虫技术目前越来越流行,这里介绍一个爬虫的简单应用. 爬取的内容为糗事百科文字内容中的信息,如图所示: 爬取糗事百科文字35页的信息,通过手动浏览,以下为前四页的网 ...

Python爬虫实战糗事百科实例

要求：

参考代码

演示效果

多线程糗事百科案例

Queue（队列对象）

多线程示意图

Python爬虫实战糗事百科实例相关推荐

最新文章

热门文章