scrapy下载斗鱼主播图片
# Spider file, placed in the project's spiders/ folder (create it there yourself).
# -*- coding: utf-8 -*-
import json

import scrapy

from Douyu.items import DouyuItem


class DouyuspiderSpider(scrapy.Spider):
    """Walk the Douyu vertical-room API page by page, yielding one item per room.

    Each response is JSON whose ``data`` entry is expected to be a list of
    room dicts. Pagination advances ``offset`` by 20 (matching ``limit=20``
    in the URL) and stops at the first page that carries no rooms.
    """

    name = 'douyuspider'
    allowed_domains = ['douyucdn.cn']
    basicUrl = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [basicUrl + str(offset)]

    def parse(self, response):
        """Turn one API page into items, then schedule the next page.

        Yields:
            A ``DouyuItem`` per room, carrying ``nickname`` and ``imagelink``
            (taken from the room's ``vertical_src``), followed by the request
            for the next page.
        """
        payload = json.loads(response.body)
        data_list = payload.get("data") if isinstance(payload, dict) else None

        # A payload without a list under "data" cannot be iterated as rooms;
        # stop here instead of failing on a KeyError/TypeError further down.
        if not isinstance(data_list, list):
            self.logger.warning(
                "Unexpected response from %s; stopping pagination", response.url
            )
            return

        # An empty page means there is nothing left to fetch.
        if not data_list:
            return

        for data in data_list:
            item = DouyuItem()
            item["nickname"] = data["nickname"]
            item["imagelink"] = data["vertical_src"]
            yield item

        self.offset += 20
        yield scrapy.Request(self.basicUrl + str(self.offset), callback=self.parse)
items文件
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    """Container for the two values the spider stores per room."""

    # Value of the room's "nickname" entry.
    nickname = scrapy.Field()

    # Image URL taken from the room's "vertical_src" entry.
    imagelink = scrapy.Field()
pipelines文件
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os  # used to rename the downloaded files

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline  # Scrapy's image-download pipeline
from scrapy.utils.project import get_project_settings

# Root directory for downloaded images. Read through Scrapy's settings loader
# rather than ``from settings import ...``, which only resolves when the
# package directory itself happens to be on ``sys.path``.
images_store = get_project_settings().get("IMAGES_STORE")


class DouyuPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the nickname."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imagelink`` URL."""
        image_link = item["imagelink"]
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` inside the store.

        Args:
            results: iterable of ``(ok, info)`` pairs; for successful
                downloads ``info["path"]`` is the stored file's path
                relative to the image store.
            item: the item whose image was requested.
            info: pipeline bookkeeping object (unused here).

        Returns:
            The item, unchanged.

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        # Keep only the paths of downloads that succeeded.
        image_paths = [result["path"] for ok, result in results if ok]
        if not image_paths:
            raise DropItem("Image download failed for %s" % item["imagelink"])

        # Join with a separator so the store path works with or without a
        # trailing slash.
        source = os.path.join(images_store, image_paths[0])
        target = os.path.join(images_store, item["nickname"] + ".jpg")

        # os.rename refuses to overwrite an existing file on Windows; remove a
        # stale target first so re-runs behave the same on every platform.
        if os.path.exists(target):
            os.remove(target)
        os.rename(source, target)
        return item
settings文件
# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

# Directory where downloaded images are saved; change it to your own path.
# Written as a raw string so the backslashes are never read as escape
# sequences.
IMAGES_STORE = r"D:\PycharmProjects\Douyu\images"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Douyu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy下载斗鱼主播图片相关推荐
- python爬虫(五)---斗鱼主播图片下载并重命名
目的:爬取照片用主播名进行重命名 url:http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0 (一)基本步骤 步骤 ...
- scrapy抓斗鱼主播的图片
1.该项目通过此网站获取信息 http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0 打开是这样子的,(如果显示乱码, ...
- python爬取斗鱼主播图片
今天闲来无事,爬取一下斗鱼女主播的图片,之前学习scrapy的时候写过一个找不到了,今天使用requests和bs4重新写了一份,闲话不多说,直奔主题. 首先用Chrome浏览器访问斗鱼官网: 当然是 ...
- python爬取斗鱼主播图片_F_hawk189_新浪博客
今天闲来无事,爬取一下斗鱼女主播的图片,之前学习scrapy的时候写过一个找不到了,今天使用requests和bs4重新写了一份,闲话不多说,直奔主题. 首先用Chrome浏览器访问斗鱼官网: 当然是 ...
- 使用scrapy爬取手机版斗鱼主播的房间图片及昵称
目的:通过fiddler在电脑上对手机版斗鱼主播进行抓包,爬取所有主播的昵称和图片链接 关于使用fiddler抓取手机包的设置: 把手机和装有fiddler的电脑处在同一个网段(同一个wifi),手机 ...
- scrapy 斗鱼 主播信息爬取
原文链接: scrapy 斗鱼 主播信息爬取 上一篇: scrapy 妹子图网站 全站图片爬取 下一篇: TensorFlow models 的slim 模块 使用预训练模型进行识别 api http ...
- python怎么爬虎牙_Python爬虫:爬取虎牙星秀主播图片
动态爬取思路讲解 1.简单的爬虫只需要访问网站搜索栏处的url,就可以在开发者工具(F12)处,利用正则表达式.Xpath.css等进行定位并抓取数据: 2.虎牙星秀页面不同于简单的网页,随时都在更新 ...
- Python爬取美女主播图片适合初学者
Python爬取虎牙女主播图片,非常适合初学者,代码少,思路清晰 开发环境Pycharm import time import requests from lxml import etree from ...
- 斗鱼直播画面怎么弄到自己网页上_“集战!创界山勇者”斗鱼主播招募活动开始啦!...
关注微信公众号:梦幻模拟战手游 Langrisser传说,由你书写! <梦幻模拟战>x<魔神英雄传>联动活动火热来袭!"小救星"战部渡与伙伴剑部武一郎.忍部 ...
最新文章
- 常见java相关问题
- java聊天室小程序论文_在Java项目中利用continue与break制作一个聊天室小程序
- 008_JsonConfig对象
- python 递归 分叉_浅谈Python 递归算法指归
- 7-37 图形卡片排序游戏 (40 分)
- ios地图小例子和手势的使用 供大家参考一下呦
- Cloud一分钟 | 电商月将至,腾讯云DCDB助力电商企业应对支付洪峰
- MySQL无法启动服务器(1067)
- GLSurfaceView源码分析以及简单使用
- 【Redis】redis-3.0.0安装以及集群的搭建
- 输入某二叉树的前序遍历和中序遍历的结果,请重建出该二叉树。
- 霍尼韦尔门禁说明书_霍尼韦尔指纹锁说明书
- [青海、甘南之行散记] 当风吹过高原,一颗心在说话
- 自然语言处理NLP简介
- Excel中ISEVEN函数用法之判断数值奇偶性
- 【第163期】游戏策划做游戏:用UnityBolt实现游泳功能
- caffe常用层:Reduction层
- Flink大数据实时计算系列-Flink的Operator Chains的优化机制
- 计算机管理损坏的图像,win7系统提示损坏的图像的解决方法
- [cocos2d-x] -- Cocos2d-x简介
热门文章
- 输入一串字符,将其中的大写变成小写,若不为大写则原样输出
- Cadence(virtuoso)集成电路设计软件基本操作——库和库文件
- matlab中频域信号IFFT,MATLAB中ifft函数用法、性质、特性-以及与fft的组合应用全面深入解析(含程序)...
- C++标准库分析总结(一)——<标准库简介>
- ₣Y2XAEfuV1₳ 这缎,登陸块守,友爱
- redisson + CacheManager缓存管理
- Go语言开发学习笔记(持续更新中)
- duilib 关于wke 控件焦点问题
- 抖音素材哪里收集_抖音素材哪里找?最全攻略来了
- Json对象和string之间的转换