python爬取五百丁ppt模板（有图+有代码）

更新时间 2019年3月15日

五百丁首页

下载截图

python代码

# -*-coding:utf-8-*-
import datetime
import json
import os
import threading
import time
import traceback
from queue import Queue

import requests
from lxml import etree
try:
    from pip._vendor.retrying import retry
except ImportError:
    # ``pip._vendor`` is pip's private namespace and ``retrying`` may not be
    # shipped there, so fall back to a minimal decorator that accepts the one
    # keyword this script uses.
    import functools

    def retry(stop_max_attempt_number=1):
        """Return a decorator that re-invokes a callable when it raises.

        The wrapped callable is attempted up to ``stop_max_attempt_number``
        times; the exception raised by the final attempt is propagated.
        """
        attempts = max(1, stop_max_attempt_number)

        def decorate(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                for attempt in range(1, attempts + 1):
                    try:
                        return func(*args, **kwargs)
                    except Exception:
                        if attempt >= attempts:
                            raise

            return wrapper

        return decorate


class _500d_ppt:
    """Download the PPT templates of one listing page of www.500d.me.

    The work is a four-queue pipeline driven by threads started in ``run``:
    listing URL -> listing HTML -> parsed list items -> items with a zip URL
    -> files written to ``save_dir`` (a ``.png`` preview and a ``.zip`` per
    template).
    """

    count = 0  # number of templates downloaded so far (simple counter)

    # Serialises updates of ``count``; ``save_data`` runs in several threads.
    _count_lock = threading.Lock()

    # Characters replaced in file names (not accepted by Windows, and "/" is
    # a path separator everywhere).
    _ILLEGAL_NAME_CHARS = '\\/:*?"<>|'

    # Seconds to wait for the detail and download requests.
    _REQUEST_TIMEOUT = 30

    def __init__(self, initUrl, page, savedir):
        """Store the target page and create the pipeline queues.

        Args:
            initUrl: Listing URL without query string,
                e.g. ``http://www.500d.me/ppt/moban/baogao/``.
            page: Page number appended as ``?pageNumber=<page>``.
            savedir: Directory the downloaded files are written to.
        """
        self.initUrl = initUrl
        self.page = page
        self.save_dir = savedir
        # e.g. http://www.500d.me/ppt/moban/baogao/?pageNumber=2
        self.url_pattern = self.initUrl + "?pageNumber={}".format(self.page)
        self.detail_url_pattern = "http://www.500d.me/order/check_product_downtimes/?pid={}&_={}"
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Connection": "keep-alive",
            # Placeholder text ("needs the cookie of a lifetime member; lifetime
            # membership costs 39 yuan and allows 30 downloads per day").
            # Replace it with a real cookie value before running.
            "Cookie": "需要终身会员的cookie，终身会员39元，每天可以下载30套",
            "Host": "www.500d.me",
            "Referer": "http://www.500d.me/template/491.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        self.url_queue = Queue()
        self.html_queue = Queue()  # listing-page HTML (or None on failure)
        self.list_queue = Queue()  # items parsed from the listing page
        self.data_queue = Queue()  # final items, including zip URL and image URL

    @retry(stop_max_attempt_number=4)
    def _send_req(self, url):
        """Fetch ``url`` and return the response text.

        Raises:
            RuntimeError: If the response status is not 200. Any exception
                makes the ``retry`` decorator attempt the request again.
        """
        # NOTE(review): verify=False disables TLS certificate verification; it
        # has no effect on the http:// URLs used here but should be revisited
        # if the script is pointed at https:// URLs.
        response = requests.get(url, headers=self.headers, timeout=30, verify=False)
        # An explicit check instead of ``assert``, which is removed under -O.
        if response.status_code != 200:
            raise RuntimeError(
                "unexpected HTTP status {} for {}".format(response.status_code, url)
            )
        html = response.text
        print(html)
        print("-------------------html获取成功-------------------")
        return html

    def get_html(self):
        """Worker loop: turn every queued URL into HTML on ``html_queue``."""
        while True:
            url = self.url_queue.get()
            # e.g. http://www.500d.me/ppt/moban/dabian/?pageNumber=2
            print(url)
            try:
                html = self._send_req(url)
            except Exception:
                traceback.print_exc()
                # Every retry failed: pass None downstream.
                html = None
            self.html_queue.put(html)
            self.url_queue.task_done()  # pairs with get(): decrements the counter

    @staticmethod
    def _template_id(href):
        """Return the template id, i.e. the last path segment without ``.html``."""
        tail = href[href.rfind('/') + 1:]
        end = tail.rfind('.html')
        return tail if end == -1 else tail[:end]

    def _parse_row(self, row):
        """Build the item dict for one ``<li class='listItem'>`` element.

        Returns None when the template is already saved or its link does not
        contain ``ppt``. Raises IndexError when an expected node is missing.
        """
        title = row.xpath("./div[@class='item']/p/a/text()")[0]
        if self.isFileExist(title):
            print("已经存在 " + title)
            return None
        img_url = row.xpath("./div/a/img/@src")[0]
        href = row.xpath("./div[@class='item']/a/@href")[0]
        if 'ppt' not in href:
            return None
        item = {
            "title": title,
            "imgUrl": img_url,
            "href": href,
            "id": self._template_id(href),
        }
        print(item)
        return item

    def parse_list(self):
        """Worker loop: parse listing HTML into items on ``list_queue``."""
        while True:
            html = self.html_queue.get()
            try:
                if html is not None:
                    rows = etree.HTML(html).xpath("//li[@class='listItem']")
                    for row in rows:
                        try:
                            item = self._parse_row(row)
                        except IndexError:
                            # A row without the expected nodes is skipped
                            # instead of ending the worker.
                            traceback.print_exc()
                            continue
                        if item is not None:
                            self.list_queue.put(item)
            except Exception:
                traceback.print_exc()
            finally:
                # Always acknowledge the task, otherwise run() never returns
                # from html_queue.join().
                self.html_queue.task_done()

    @classmethod
    def _safe_name(cls, title):
        """Return ``title`` with illegal file-name characters replaced by ``_``."""
        return "".join(
            "_" if ch in cls._ILLEGAL_NAME_CHARS or ord(ch) < 32 else ch
            for ch in title
        )

    # Check whether a template was already downloaded.
    def isFileExist(self, title):
        """Return True if a file in ``save_dir`` has ``title`` in its name.

        The title is compared in the sanitised form used when saving.
        """
        wanted = self._safe_name(title)
        return any(wanted in name for name in self.file_names(self.save_dir))

    def file_names(self, file_dir):
        """Return the names of the files directly inside ``file_dir``.

        Sub-directories are not descended into. A missing directory yields an
        empty list.
        """
        # os.walk yields (root, dirs, files); only the first tuple is needed.
        return next(os.walk(file_dir), (file_dir, [], []))[2]

    def _fetch_zip_url(self, item):
        """Ask the site for the zip URL of ``item``; return None if refused."""
        timestamp = int(time.time())
        detail_url = self.detail_url_pattern.format(item['id'], timestamp)
        print("detail_url:" + detail_url)
        response = requests.get(
            detail_url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            print("detail 失败:" + str(response.status_code))
            return None
        data = json.loads(response.content.decode())
        if data.get('type') == 'error':
            print("zip地址解析失败,今天下载次数已经用完")
            return None
        return data['content']

    # Request the detail endpoint for every parsed item.
    def parse_detail(self):
        """Worker loop: add ``zipUrl`` to items and forward them.

        Items for which no zip URL could be obtained are dropped, so that
        ``save_data`` only receives complete items.
        """
        while True:
            item = self.list_queue.get()
            print(item)
            try:
                zip_url = self._fetch_zip_url(item)
                if zip_url:
                    item['zipUrl'] = zip_url
                    print(item)
                    self.data_queue.put(item)
            except Exception:
                traceback.print_exc()
            finally:
                self.list_queue.task_done()

    def _download(self, url, path):
        """Download ``url`` and write the response body to ``path``."""
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Do not save an error page under the template's name.
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)

    def save_data(self):
        """Worker loop: save the preview image and the zip of each item."""
        while True:
            item = self.data_queue.get()
            print(item)
            try:
                base = os.path.join(self.save_dir, self._safe_name(item['title']))
                img_url = item['imgUrl']
                zip_url = item['zipUrl']

                # Preview image
                print("开始下载 img" + " " + img_url)
                self._download(img_url, base + ".png")

                # Template archive
                print("开始下载 zip" + " " + zip_url)
                self._download(zip_url, base + ".zip")

                # Count only templates whose files were both written.
                with self._count_lock:
                    self.count += 1
            except Exception:
                traceback.print_exc()
            finally:
                self.data_queue.task_done()

    def get_url_list(self):
        """Queue the listing URL of the configured page."""
        self.url_queue.put(self.url_pattern)

    def run(self):
        """Start all worker threads and block until every queue is drained."""
        print("--------500丁简历下载 多线程版 begin--------")
        # (banner, worker, number of threads), in pipeline order.
        stages = [
            ("----------------1.准备url---------------", self.get_url_list, 1),
            ("----------------2遍历，发送请求，获取响应---------------", self.get_html, 1),
            ("----------------3.提取列表---------------", self.parse_list, 1),
            ("----------------4.解析详情---------------", self.parse_detail, 10),
            ("----------------5.保存---------------", self.save_data, 10),
        ]
        thread_list = []
        for banner, worker, amount in stages:
            print(banner)
            thread_list.extend(
                threading.Thread(target=worker) for _ in range(amount)
            )
        producer = thread_list[0]

        print("-----开启了 " + str(len(thread_list)) + " 个线程执行任务-----")
        for t in thread_list:
            # Daemon threads end with the main thread; the workers loop forever.
            t.daemon = True
            t.start()

        # Make sure the URL is queued before waiting on the queues; an empty
        # queue would let join() return immediately.
        producer.join()
        for q in [self.url_queue, self.html_queue, self.list_queue, self.data_queue]:
            q.join()  # blocks until every queued task was acknowledged
        print("--------一共获取{}份ppt模板--------".format(self.count))

    # Strip whitespace from a string.
    def filterString(self, str):
        """Return ``str`` without carriage returns, newlines, tabs and spaces."""
        return str.translate({ord(ch): None for ch in "\r\n\t "})


def main():
    """Download the configured page range and print the elapsed time."""
    starttime = datetime.datetime.now()
    initUrl = "http://www.500d.me/ppt/moban/baogao/"  # listing URL
    order = "工作汇报"  # category name, used in the directory name
    pageStart = 0  # first page
    pageEnd = 2  # last page (exclusive)
    for page in range(pageStart, pageEnd):
        savedir = "D:\\0五百丁-ppt\\" + order + "-" + str(page) + "\\"
        os.makedirs(savedir, exist_ok=True)
        ding = _500d_ppt(initUrl, page, savedir)
        ding.run()
    endtime = datetime.datetime.now()
    print("程序耗时： ", endtime - starttime)


if __name__ == '__main__':
    main()

python爬取五百丁ppt模板（有图+有代码）相关推荐

五百丁-ppt模板-创业融资-top45 下载
更新时间 2019年3月15日五百丁首页下载截图下载地址 https://download.csdn.net/download/billycoder/11021603 https://downl ...
五百丁-word模板-推荐简历-top58 下载
更新时间 2019年3月15日五百丁首页下载截图简历地址 https://download.csdn.net/download/billycoder/11021283
为避免尬聊，我用Python爬取了一千多张斗图！
前几天和女神聊天的时候实在是太尬了,因为没有足够的斗图表情包,整个聊天的气氛都带动不起来,所以抑郁不得志! 为了追到心目中的完美女神,我爬了一千多张斗图表情包,只为下一次聊天的时候,主动权都在我的手上 ...
python 爬取_我用Python爬取了妹子网100G的套图
前言最近在做监控相关的配套设施,发现很多脚本都是基于Python的.很早之前就听说其大名,人生苦短,我学Python,这并非一句戏言.随着人工智能.机器学习.深度学习的崛起,目前市面上大部分的人工智 ...
我用Python爬取了妹子网200G的套图
前言最近在做监控相关的配套设施,发现很多脚本都是基于Python的.很早之前就听说其大名,人生苦短,我学Python,这并非一句戏言.随着人工智能.机器学习.深度学习的崛起,目前市面上大部分的人工智 ...
我用Python爬取了妹子网100G的套图
前言最近在做监控相关的配套设施,发现很多脚本都是基于Python的.很早之前就听说其大名,人生苦短,我学Python,这并非一句戏言.随着人工智能.机器学习.深度学习的崛起,目前市面上大部分的人工智 ...
Python爬取百度搜索的标题和真实URL的代码和详细解析
网页爬取主要的是对网页内容进行分析,这是进行数据爬取的先决条件,因此博客主要对爬取思路进行下解析,自学的小伙伴们可以一起来学习,有什么不足也可以指出,都是在自学Ing,回归正题今天我们要来爬取百度搜索 ...
python爬取微博热搜显示到折线图_Python爬取新浪微博热搜榜-Go语言中文社区
我们如何爬取这50条热搜呢?今天写一个简单的方法供感兴趣的朋友们参考! 引用库: requests json lxml.etree bs4.BeautifulSoup引用方法如下: 如果没有下载的需要 ...
python爬取微博热搜显示到折线图_微博热搜榜前20信息数据爬取进行数据分析与可视化...
一.设计方案 1.主题式网络爬虫名称:微博热搜榜前20信息数据爬取进行数据分析与可视化 2.爬取内容与数据特征分析:爬取微博热搜榜前20热搜事件.排名与热度,数据呈一定规律排序. 3.设计方案概述:思 ...
Python爬取酷狗音乐-详解(多图预警)
目录 1.前言 2.分析一下 1. 2. 3. 3.代码解释 4.完整代码 5.结语 1.前言前面发布了一篇关于QQ音乐爬取的教程,但对于我们这种文艺青年来说,一个平台的歌曲怎么够我们听的,也是因为 ...

python爬取五百丁ppt模板（有图+有代码）

五百丁首页

下载截图

python代码

python爬取五百丁ppt模板（有图+有代码）相关推荐

最新文章

热门文章