使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中
备注还没来得及写。本项目共爬取八千多名歌手,按每名歌手平均三十首歌曲计算,总计大约二十多万首歌曲。
run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entry point: run the kugou spider programmatically via CrawlerProcess."""
__author__ = 'Zqf'
# Bug fix: the original imported DingdianSimpleSpider from an unrelated
# `dingdian_simple` project; this project defines KugouSpiders in
# kugoumusic/spiders/kugou.py (see SPIDER_MODULES in settings.py).
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py so pipelines and headers are applied.
settings = get_project_settings()
process = CrawlerProcess(settings=settings)

# Multiple spiders may be queued by calling process.crawl() repeatedly.
process.crawl(KugouSpiders)

# Starts the crawl; blocks until all spiders finish.
process.start()
kugou.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

__author__ = 'Zqf'

import scrapy
from kugoumusic.items import KugoumusicItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class KugouSpiders(scrapy.spiders.CrawlSpider):
    """Crawl kugou.com: follow singer index pages, scrape each singer's songs.

    Yields one KugoumusicItem per singer home page, holding the singer
    name and the list of song titles shown on that page.
    """

    name = 'kugou'

    start_urls = ['http://www.kugou.com/']

    # Bug fix: patterns were plain strings with unescaped dots, so e.g.
    # ``singer.html`` would match any character before ``html``. Use raw
    # strings and escape literal dots to keep the match exact.
    rules = (
        # Follow the singer index page and its per-letter pagination pages
        # (digit category, letter a-z or "null", page 1).
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/html/singer\.html',
                                  r'http://www\.kugou\.com/yy/singer/index/\d-([a-z]|null)-1\.html'])),
        # Singer detail pages are parsed for the actual data.
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/singer/home/\d+\.html']),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract singer name and song titles from a singer home page."""
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs

        yield item
items.py
# -*- coding: utf-8 -*-

# Item models for the kugoumusic project.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    """Container for one scraped singer and that singer's song titles."""
    # Singer display name (a single string).
    singer = scrapy.Field()
    # List of song-title strings collected from the singer's home page.
    songs = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Item pipeline: buffer scraped items and bulk-insert them into MongoDB.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class KugoumusicPipeline(object):
    """Buffers items and writes them to MongoDB in batches of 100."""

    BATCH_SIZE = 100  # flush the buffer once it reaches this many items

    def open_spider(self, spider):
        # NOTE(review): host/port are hard-coded; consider reading them
        # from spider.settings['MONGO_CONFIG'] instead (see settings.py).
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []

    def close_spider(self, spider):
        # Flush whatever is left in the buffer before disconnecting.
        self.insert()
        self.client.close()

    def insert(self):
        # Bug fix: insert_many raises InvalidOperation on an empty list,
        # which the original hit in close_spider; guard against it.
        if self.li:
            self.coll.insert_many(self.li)
            self.li = []

    def process_item(self, item, spider):
        # Bug fix: the original only appended in the else-branch, silently
        # dropping the item that triggered a flush (one lost item per 100).
        # Always buffer first, then flush when the batch is full.
        self.li.append(dict(item))
        if len(self.li) >= self.BATCH_SIZE:
            self.insert()
            print("成功插入100条数据-------------------------------------")
        return item
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the kugoumusic project.
#
# Only the settings this project overrides are active below; the full
# catalogue of options is documented at:
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MongoDB connection parameters could be centralised here and read by the
# pipeline, e.g.:
# MONGO_CONFIG = {'host': '127.0.0.1', 'port': 27017}

# Crawl without honouring robots.txt.
ROBOTSTXT_OBEY = False

# Send browser-like headers so the site serves normal pages.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Route every scraped item through the MongoDB pipeline.
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Further optional knobs left at their defaults (see the docs above):
# CONCURRENT_REQUESTS, DOWNLOAD_DELAY, COOKIES_ENABLED, AUTOTHROTTLE_*,
# HTTPCACHE_*, SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES, EXTENSIONS.
转载于:https://www.cnblogs.com/tttzqf/p/9638545.html
使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中相关推荐
- Python爬虫案例:爬取酷狗音乐全排行榜歌曲
前言 本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理 本次目标 爬取酷狗音乐全站排行榜歌曲 目标地址 https://www.ku ...
- Python爬取酷狗音乐歌手信息
前面我们说过用python爬取网易云音乐的歌手信息,Python爬取网易云音乐歌手信息 今天我们来爬取一下酷狗音乐的歌手信息(歌手id和歌手名),如果环境没有安装好,可以参照前面爬网易云环境配置作为参 ...
- scrapy_redis分布式爬取酷狗音乐
scrapy_redis分布式爬取酷狗音乐 前言 安装scrapy_redis 创建scrapy项目 spider模块 items模块.pipelines模块 setting.py 调试 运行 成果图 ...
- Python爬虫爬取酷狗音乐TOP500
Python大作业 内容简介: 用Python来爬取酷狗音乐TOP500的歌曲信息,统计这500首歌曲中出现的所有歌手,并做可视化处理生成词云 实验代码: import time import req ...
- Python爬虫入门——2. 2爬取酷狗音乐top1-500歌曲信息
有了第一个程序的基础,我们现在来爬取酷狗音乐top500的歌曲信息.连接http://www.kugou.com/yy/rank/home/1-8888.html 我们第一个程序只爬取了一个页面的数据 ...
- scrapy 爬取酷狗T500音乐
scrapy 爬取酷狗T500音乐 开始工作 代码的编写 开始工作 1.创建项目scrapy startproject kugouScrapy 2.创建spider cd kugou scrapy g ...
- Python爬取酷狗音乐-详解(多图预警)
目录 1.前言 2.分析一下 1. 2. 3. 3.代码解释 4.完整代码 5.结语 1.前言 前面发布了一篇关于QQ音乐爬取的教程,但对于我们这种文艺青年来说,一个平台的歌曲怎么够我们听的,也是因为 ...
- 爬虫训练(三):爬取酷狗音乐
今天趁机一鼓作气,把简单爬虫内容一次学习完毕,最后以爬取酷狗音乐排行榜歌曲作为结束,然后对此次学习做一个整理和总结.而且前两篇有些混乱,这里把内容做一次阶段性总结. 一.安装包 爬虫三大包:reque ...
- Python爬虫之爬取酷狗音乐歌曲
Python爬虫之爬取酷狗音乐歌曲 1.安装第三方库 在Python的语言库中, 分为Python标准库和Python的第三方库. Python标准库是在你安装Python的时候已经包含在了安装目录下 ...
最新文章
- 机器人操作系统ROS Indigo 入门学习(1)——安装ROS Indigo【转】
- C语言数组的一些运算*a,a+1,a+1,a+0
- 中国反渗透膜产业竞争现状与投资战略决策报告2021-2027年版
- Swing组件集合的事件处理(二)
- 从雷军那里反思,做什么样的公司?
- nginx.conf删除与否网页都能访问(nginx没有生效)的问题
- LeetCode:Remove Duplicates from Sorted List I II
- 前端很慌!React 称霸,Vue 凶猛,TypeScript 威逼 JavaScript
- 语音识别技术原理概述!
- 学习 尚硅谷-宋红康 Java基本语法(上): 变量与运算符
- ASP.NET MVC:WebViewPage.cs
- 通过Kali linux 模拟CC攻击进行WEB压力测试实战
- offer来了(原理篇)学习笔记-第9章设计模式
- linux dns配置srv记录,DNS之SRV记录
- cps评分和tps评分_一文总结:PD-1/PD-L1免疫检查点抑制剂和TPS、CPS、IPS
- U盘中毒后里面的数据怎样恢复
- 【论文阅读】Misshapen Pelvis Landmark Detection WithLocal-Global Feature Learning for DiagnosingDevelop
- 自定义 ViewGroup,实现多功能流式布局与网格布局
- 荣耀智慧屏鸿蒙远程控制电脑,荣耀智慧屏评测:鸿蒙OS加持 面向未来的超智能电视...
- 与声网面对面!声网Agora开发者吐槽大会招募中