python爬虫：利用多线程爬虫爬取下载进击的巨人图片

本文的文字及图片来源于网络，仅供学习、交流使用，不具有任何商业用途，版权归原作者所有。如有问题，请及时联系我们以作处理。

本篇文章来自腾讯云，作者：孤独的明月

文章目录

线程池
获取图片链接
下载图片
存在的问题

线程池

import contextlib
import glob
import os
import re
import threading
import time
from queue import Queue
from urllib import request

import requests
from bs4 import BeautifulSoup
class ThreadPool(object):
    """A small thread pool that feeds worker threads from a FIFO queue.

    Worker threads are created lazily by :meth:`run`, never more than
    ``max_num`` of them.  Each worker loops in :meth:`call`, pulling
    ``(func, args, callback)`` tuples from ``self.q`` until it receives
    ``self.StopEvent``.
    """

    def __init__(self, max_num):
        """Create an empty pool.

        :param max_num: maximum number of worker threads to create
        """
        # Stop sentinel: a worker that pulls this value from the queue exits.
        # Any value that cannot be mistaken for a task tuple works.
        self.StopEvent = 0
        self.q = Queue()  # pending tasks and stop sentinels
        self.max_num = max_num
        self.terminal = False  # set by terminate() to stop workers early
        self.created_list = []  # worker threads that have not exited yet
        self.free_list = []  # workers currently waiting for a task
        self.failed_tasks = Queue()  # tasks whose function raised
        self.Deamon = False  # value assigned to each worker's ``daemon`` flag
        self.recycle_failed_tasks = False  # not read anywhere in this class

    def run(self, func, args, callback=None):
        """Queue one task, starting a new worker first if that is useful.

        A worker is started only when no worker is idle and fewer than
        ``max_num`` workers exist.

        :param func: task function
        :param args: positional arguments for ``func`` (unpacked on call)
        :param callback: optional callable invoked with ``func``'s return
            value after ``func`` succeeds
        :return: None
        """
        if not self.free_list and len(self.created_list) < self.max_num:
            self.create_thread()
        self.q.put((func, args, callback))

    def create_thread(self):
        """Start one worker thread running :meth:`call`."""
        worker = threading.Thread(target=self.call)
        worker.daemon = self.Deamon
        # Register before starting so the worker's own removal in call()
        # can never run ahead of this append.
        self.created_list.append(worker)
        worker.start()

    def call(self):
        """Worker loop: fetch tasks and execute them until told to stop."""
        current_thread = threading.current_thread()
        event = self.q.get()
        while event != self.StopEvent:
            func, arguments, callback = event
            try:
                result = func(*arguments)
            except Exception as e:
                print('函数执行产生错误', e)
                self.failed_tasks.put(event)
            else:
                # The callback only runs when the task function succeeded.
                if callback is not None:
                    try:
                        callback(result)
                    except Exception as e:
                        print('回调函数执行产生错误', e)
            # While waiting for the next task the worker is listed as free.
            with self.worker_state(self.free_list, current_thread):
                if self.terminal:
                    # terminate() was requested: stop without fetching more.
                    event = self.StopEvent
                else:
                    event = self.q.get()  # blocks until something is queued
                    print('remaining tasks: ', self.q.qsize())
        # The stop sentinel was received: unregister this worker.
        self.created_list.remove(current_thread)

    def close(self):
        """Ask every worker to stop once the already queued tasks are done.

        One stop sentinel is queued per currently registered worker; the
        sentinels sit behind the tasks that were queued earlier.
        """
        for _ in range(len(self.created_list)):
            self.q.put(self.StopEvent)

    def terminate(self):
        """Stop all workers even if tasks are still queued."""
        self.terminal = True
        while self.created_list:
            self.q.put(self.StopEvent)
            time.sleep(0.01)
        # Drop whatever is left, mostly the surplus stop sentinels.
        self.q.queue.clear()

    def join(self):
        """Block until every worker that is currently registered has exited."""
        # Iterate over a snapshot: workers remove themselves from
        # created_list as they exit, and iterating the live list would skip
        # threads and could return while some are still running.
        for worker in list(self.created_list):
            worker.join()

    @contextlib.contextmanager
    def worker_state(self, state_list, worker_thread):
        """Keep ``worker_thread`` in ``state_list`` for the ``with`` body.

        :param state_list: list that tracks threads in a given state
        :param worker_thread: thread to add on entry and remove on exit
        """
        state_list.append(worker_thread)
        try:
            yield
        finally:
            state_list.remove(worker_thread)

获取图片链接

if __name__ == '__main__':
    # Stage 1: collect image links.
    # For every chapter listed on the index page, probe its reader pages,
    # extract the image URL from each one and store that URL in a .txt file
    # next to where the image will later be saved.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def run(url, save_dir):
        """Resolve the image URL embedded in one reader page.

        :param url: reader page URL of the form ``.../index_<n>.html``
        :param save_dir: directory the image belongs to
        :return: ``(image_url, image_path)`` where ``image_path`` is
            ``<save_dir>/<n>.jpg``
        :raises IndexError: if the page contains no ``mhurl="...jpg"`` entry
        """
        time.sleep(1)  # presumably to throttle requests to the site
        # NOTE(review): verify=False disables TLS certificate verification.
        # The timeout keeps a stalled connection from blocking a worker forever.
        html = requests.get(url, headers=headers, verify=False, timeout=10)
        raw = html.text
        img = re.findall(r'mhurl="(.*?jpg)"', raw)
        # Image paths whose first segment is a number below 2016 are served
        # from the p5 host, everything else from p1.
        prefix = 'http://p1.manhuapan.com/'
        if int(img[0].split('/')[0]) < 2016:
            prefix = 'http://p5.manhuapan.com/'
        img = prefix + img[0]
        page_index = url.split('.')[-2].split('_')[-1]
        img_path = os.path.join(save_dir, page_index + '.jpg')
        return (img, img_path)

    def save(res):
        """Write an image URL to a ``.txt`` file named after the image path.

        :param res: ``(image_url, image_path)`` as returned by ``run``
        """
        url, save_path = res[0], res[1]
        # Swap only the extension; a plain str.replace would also rewrite
        # any "jpg" that happens to occur in a directory name.
        txt = os.path.splitext(save_path)[0] + '.txt'
        with open(txt, 'w') as file:
            file.write(url)
        print('save {} to {}'.format(url, txt))

    path = '巨人/'
    root = 'https://manhua.fzdm.com/39/'
    html = requests.get(root, timeout=10).text
    bs = BeautifulSoup(html, features="lxml")
    titles = bs.find_all('li', {'class': 'pure-u-1-2 pure-u-lg-1-4'})

    # One output directory per chapter, named after the chapter title.
    catalogs = []
    for i in titles:
        href, title = i.a.get('href').strip('/'), i.a.text
        catalogs.append((href, title))
        os.makedirs(path + title, exist_ok=True)

    # Page indices 0..99 are probed for every chapter; a page without an
    # image entry makes run() raise, and the pool records it as failed.
    tasks = []
    for href, title in catalogs:
        diry = path + title
        for j in range(100):
            u = root + href + '/index_' + str(j) + '.html'
            tasks.append((u, diry))

    start = time.time()
    pool = ThreadPool(100)
    for t in tasks:
        pool.run(func=run, args=t, callback=save)
    pool.close()
    pool.join()
    print("任务队列里任务数%s" % pool.q.qsize())
    print("当前存活子线程数量:%d" % threading.active_count())
    print("当前线程创建列表:%s" % pool.created_list)
    print("当前空闲线程列表:%s" % pool.free_list)
    print("失败的任务列表:%s" % pool.failed_tasks.queue)
    print('total time: ', time.time() - start)

下载图片

    # Stage 2: download images.
    # Every .txt file found one directory level below `path` holds one image
    # URL; the image is saved next to it with a .jpg extension.
    files = glob.glob(path + '*/*.txt')
    print(files)

    def download(filename):
        """Download the image whose URL is stored in ``filename``.

        :param filename: path of a ``.txt`` file whose first line is the URL
        """
        time.sleep(1)  # presumably to throttle requests to the image host
        with open(filename, 'r') as file:
            url = file.readline().strip()
        req = request.Request(url, headers=headers)
        # Read the whole body before creating the output file, so a failed
        # or timed-out read cannot leave a 0-byte image on disk.
        with request.urlopen(req, timeout=10) as response:
            data = response.read()
        img_path = os.path.splitext(filename)[0] + '.jpg'
        with open(img_path, 'wb') as f_save:
            f_save.write(data)
        print('download: ', url)

    start = time.time()
    pool = ThreadPool(100)
    for t in files:
        pool.run(func=download, args=(t,), callback=None)
    pool.close()
    pool.join()
    print("任务队列里任务数%s" % pool.q.qsize())
    print("当前存活子线程数量:%d" % threading.active_count())
    print("当前线程创建列表:%s" % pool.created_list)
    print("当前空闲线程列表:%s" % pool.free_list)
    print("失败的任务列表:%s" % pool.failed_tasks.queue)
    print('total time: ', time.time() - start)

存在的问题

response = request.urlopen(req, timeout=10)
# open() creates the output file before response.read() is evaluated, so if
# read() raises (for example on a timeout) an empty file is left behind.
with open(path, 'wb') as f_save:
    f_save.write(response.read())
    f_save.flush()
    f_save.close()

下载图片时如果读取超时，下载会失败；但此时输出文件已经被 open() 创建，因此会留下一个大小为 0 的图片文件。

python爬虫：利用多线程爬虫爬取下载进击的巨人图片相关推荐

python爬虫——利用百度搜索引擎爬取所需图片
参考:python 爬取动态网页(百度图片) 说明:在上面这位博主的贴子的基础上做了一些改进,解决了有些URL无法访问导致的请求超时异常抛出致使程序退出的问题.话不多说,直接上代码. import r ...
【期末课设】python爬虫基础与可视化，使用python语言以及支持python语言的第三方技术实现爬虫功能，定向爬取网页的图片数据，并且实现批量自动命名分类下载。
1.大作业的内容本要求使用python语言以及支持python语言的第三方技术实现爬虫功能,定向爬取网页的图片数据,并且实现批量自动命名分类下载. 2.案例需求要求采用虚拟浏览器等动态爬虫技术,完 ...
python爬取图片教程-推荐|Python 爬虫系列教程一爬取批量百度图片
Python 爬虫系列教程一爬取批量百度图片https://blog.csdn.net/qq_40774175/article/details/81273198# -*- coding: utf-8 ...
python从网址爬图片协程_Python爬虫多任务协程爬取虎牙MM图片
查看: 4420|回复: 241 [作品展示] Python爬虫多任务协程爬取虎牙MM图片电梯直达发表于 2019-4-17 21:35:47 | 只看该作者 |倒序浏览 |阅读模式马上注册,结 ...
python3爬虫系列03之requests库：根据关键词自动爬取下载百度图片
python3爬虫系列03之requests库:根据关键词自动爬取下载百度图片 1.前言在上一篇文章urllib使用:根据关键词自动爬取下载百度图片当中,我们已经分析过了百度图片的搜索URL的变化 ...
python爬虫利用Scrapy框架爬取汽车之家奔驰图片--实战
先看一下利用scrapy框架爬取汽车之家奔驰A级的效果图 1)进入cmd命令模式下,进入想要存取爬虫代码的文件,我这里是进入e盘下的python_spider文件夹内 C:\Users\15538> ...
Python爬虫学习，批量爬取下载抖音视频
这篇文章主要为大家详细介绍了python批量爬取下载抖音视频,具有一定的参考价值,感兴趣的小伙项目源码展示 ''' 注:如果你对python感兴趣,我这有个学习Python基地,里面有很多学习资料, ...
Python使用BeautifulSoup简单实现爬取妹子mm图片--初级篇
先来个效果截图(屈服在我的淫威之下吧!坏坏...嘿0.0) 因为是简易版而且是自己写着玩玩而已,自己也刚学,亦是笔记亦是分享,大佬轻喷就好.主要目的是希望更多人能够体验爬取一些seqing图片的快乐 ...
python自动换壁纸_Python爬取必应每日图片并实现Windows壁纸自动切换
不知道大家是否对每日一成不变的壁纸感到厌倦呢?反正对于我个人来说,如果每天打开电脑映入眼帘的都是不同的画面,那么科研热情都会被充分激发,从而提高自己的劳动生产力. 原来使用的是Deepin系统,自己写 ...

python爬虫：利用多线程爬虫爬取下载进击的巨人图片

线程池

获取图片链接

存在的问题

python爬虫：利用多线程爬虫爬取下载进击的巨人图片相关推荐

最新文章

热门文章