scrapy框架爬取王者荣耀英雄数据
scrapy框架爬取王者荣耀英雄属性
爬虫工程
爬虫文件
import scrapy

from theKingPro.items import ThekingproItem


class ThekingSpider(scrapy.Spider):
    """Crawl hero attribute data from the 18183 King of Glory database."""

    name = 'theKing'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://db.18183.com/wzry/']

    def parse(self, response):
        """Parse the hero list page and follow each hero's detail link."""
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul/li')
        for li in li_list:
            hero_name = li.xpath("./a[1]/p/text()").extract_first()
            # Links on the list page are site-relative; prepend the host.
            hero_detail_url = 'https://db.18183.com' + li.xpath('./a[1]/@href').extract_first()
            item = ThekingproItem()
            item["hero_name"] = hero_name
            # Hand the partially-filled item to the detail callback via meta.
            yield scrapy.Request(hero_detail_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """Parse one hero detail page: hero type plus the attribute list."""
        item = response.meta["item"]
        all_list = []
        # Multi-type heroes have their type split across several text nodes.
        hero_type = response.xpath('/html/body/div[2]/div/div[2]/div[1]/div[1]/div/p//text()')
        if len(hero_type) >= 2:
            all_list.append("英雄类型:%s" % ''.join(hero_type.extract()))
        else:
            # BUG FIX: extract_first() returns None when the node is missing,
            # which used to produce the literal string "英雄类型:None".
            all_list.append("英雄类型:%s" % (hero_type.extract_first() or ''))
        li_list = response.xpath('/html/body/div[2]/div/div[2]/div[2]/div[2]/div[5]/div/ul/li')
        for li in li_list:
            hero_attr = li.xpath("./p/text()").extract_first()
            # BUG FIX: skip empty <li> entries instead of appending None,
            # which would crash the pipeline's string split.
            if hero_attr:
                all_list.append(hero_attr)
        item["hero_attr"] = set(all_list)
        yield item
管道
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv


class ThekingproPipeline:
    """Write each scraped hero item as one row of ./theKing.csv."""

    # Column order for the CSV output; rows are written in this order.
    FIELDNAMES = ("英雄名称", "英雄类型", '最大生命', '最大法力', '物理攻击',
                  '法术攻击', '物理防御', '物理减伤率', '法术防御', '法术减伤率',
                  '移速', '物理护甲穿透', '法术护甲穿透', '攻速加成', '暴击几率',
                  '暴击效果', '物理吸血', '法术吸血', '冷却缩减', '攻击范围',
                  '韧性', '生命回复', '法力回复')

    fp = None

    def open_spider(self, spider):
        """Open the output file and prepare the CSV writer once."""
        print("开始爬取...")
        self.fp = open('./theKing.csv', 'w', newline="", encoding='utf-8')
        # BUG FIX: the original assigned a *local* `count = 0` here, so the
        # first `self.count += 1` in process_item raised AttributeError.
        self.count = 0
        # Build the writer once instead of once per item; ignore any
        # attribute name not listed in FIELDNAMES rather than raising.
        self.writer = csv.DictWriter(self.fp, fieldnames=self.FIELDNAMES,
                                     extrasaction='ignore')

    def process_item(self, item, spider):
        """Convert the item's attribute strings into a dict row and write it."""
        hero_dict_attr = {"英雄名称": item["hero_name"]}
        for attr in item["hero_attr"]:
            # Each entry looks like "<名称>:<值>"; split on the FIRST colon
            # only, so values that themselves contain ':' stay intact
            # (the original split(":")[1] truncated such values).
            key, sep, value = attr.partition(":")
            if sep:  # skip malformed entries without a separator
                hero_dict_attr[key] = value
        self.count += 1
        # Write the header exactly once, before the first data row.
        if self.count == 1:
            self.writer.writeheader()
        self.writer.writerow(hero_dict_attr)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fp.close()
        print("爬取完毕...")
item
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ThekingproItem(scrapy.Item):
    """Container for one hero: display name plus its attribute strings."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    hero_name = scrapy.Field()  # hero name taken from the list page
    hero_attr = scrapy.Field()  # set of "名称:值" attribute strings
配置文件
# Scrapy settings for theKingPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'theKingPro'

SPIDER_MODULES = ['theKingPro.spiders']
NEWSPIDER_MODULE = 'theKingPro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'theKingPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Only surface errors in the console output
LOG_LEVEL = "ERROR"

# Impersonate a desktop Chrome browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'theKingPro.middlewares.ThekingproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'theKingPro.middlewares.ThekingproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'theKingPro.pipelines.ThekingproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#### 效果展示
技术交流加Q:1936705477
scrapy框架爬取王者荣耀英雄数据相关推荐
- python-scrapy爬虫框架爬取王者荣耀英雄皮肤图片和技能信息
1.创建工程 将路径切换到想要保存爬虫项目的文件夹内,运行scrapy startproject WZRY新建一个名为WZRY的工程. 2.产生爬虫 将路径切换至新创建的spiders文件夹中,运行s ...
- scrapy框架爬取王者荣耀皮肤
图片采集地址请点击这里 先展示采集结果 获取github详细项目代码点击这里 创建项目 创建项目命令:scrapy startproject wangzhePhotoMax创建爬虫:scrapy ge ...
- Python爬取王者荣耀英雄的皮肤数据并下载皮肤图片项目
Python爬取王者荣耀英雄的皮肤数据,并下载皮肤图片!高清的图片用来做桌面也不错哟~ 网址:https://pvp.qq.com/web201605/herolist.shtml 1.获得英雄信息, ...
- 教你用python爬取王者荣耀英雄皮肤图片,并将图片保存在各自英雄的文件夹中。(附源码)
教你用python爬取王者荣耀英雄皮肤图片,并将图片保存在各自英雄的文件夹中.(附源码) 代码展示: 保存在各自的文件夹中 美么? 让我们开始爬虫之路 开发环境 windows 10 python3. ...
- Python批量爬取王者荣耀英雄高清壁纸
Python批量爬取王者荣耀英雄高清壁纸 文章目录 Python批量爬取王者荣耀英雄高清壁纸 前言 爬虫步骤 python代码实现 总结 前言 很多喜欢玩王者的朋友很希望把王者荣耀的英雄图片拿来做壁纸 ...
- 爬虫爬取王者荣耀 英雄故事 和技能
初识爬虫 爬取王者荣耀英雄故事和技能 爬取王者荣耀英雄故事和技能 源码奉上 import requests import re import os from lxml import etree if ...
- Python爬虫——手把手教你爬取王者荣耀英雄皮肤
大家好!我是霖hero 大家知道目前最火的手游是哪个嘛,没错,就是王者荣耀,这款手游想必大家都听过或者玩过吧,里面有106个英雄,几百个英雄皮肤,今天我来手把手教你们把几百个皮肤都爬取下来. 目录 P ...
- Python爬取 | 王者荣耀英雄皮肤海报
这里只展示代码,具体介绍请点击下方链接. Python爬取 | 王者荣耀英雄皮肤海报 import requests import re import os import time import wi ...
- Java爬虫 --- 爬取王者荣耀英雄图片
Java爬虫 - 爬取王者荣耀英雄图片 import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Docu ...
最新文章
- 圣何塞与 Microsoft 宣布该市为超过 5,000 名市府公务员选择 Office 365、Windows Azure 和 StorSimple...
- apache workprefork
- DHCP的安装与分配
- Selenium测试专项三班隆重开班
- 别被忽悠了!我来谈谈大数据平台的4个要点,你们写的都不是干货
- 微信小程序报错:invalid credential, access_token is invalid or not latest
- 区块链技术指南:术语
- Android动画分类与总结
- OWASP Top 10 2022介绍
- 用stream流来遍历处理list,筛选出符合条件的list
- Java小白常问的问题大全
- Python3脚本抢票
- 网站托管收费是否有标准
- 广告术语(持续更新...)
- STM32F3 GPIO的八种模式及工作原理
- 命题公式的主合取范式C语言,用C或C++编写程序,要求:输入命题公式,给出它的主合取范式和主析取范式....
- 支持C/C++、Java、python、Matlab等语言的第三方机器学习库汇总
- Android 科大讯飞开发相关
- 一个demo让你彻底搞懂观察者模式
- 结构体与typedef关键字