使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中
备注还没来得及写。本项目共爬取八千多名歌手,按每名歌手平均三十首歌曲计算,总计大约二十多万首歌曲。
run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entry point: run the kugou spider programmatically via CrawlerProcess."""
__author__ = 'Zqf'
# Bug fix: the original imported DingdianSimpleSpider from an unrelated
# `dingdian_simple` project; this project defines KugouSpiders in
# kugoumusic/spiders/kugou.py (see SPIDER_MODULES in settings.py).
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py so pipelines and headers are applied.
settings = get_project_settings()
process = CrawlerProcess(settings=settings)

# Multiple spiders may be queued by calling process.crawl() repeatedly.
process.crawl(KugouSpiders)

# Starts the crawl; blocks until all spiders finish.
process.start()
kugou.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

__author__ = 'Zqf'

import scrapy
from kugoumusic.items import KugoumusicItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class KugouSpiders(scrapy.spiders.CrawlSpider):
    """Crawl kugou.com: follow singer index pages, scrape each singer's songs.

    Yields one KugoumusicItem per singer home page, holding the singer
    name and the list of song titles shown on that page.
    """

    name = 'kugou'

    start_urls = ['http://www.kugou.com/']

    # Bug fix: patterns were plain strings with unescaped dots, so e.g.
    # ``singer.html`` would match any character before ``html``. Use raw
    # strings and escape literal dots to keep the match exact.
    rules = (
        # Follow the singer index page and its per-letter pagination pages
        # (digit category, letter a-z or "null", page 1).
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/html/singer\.html',
                                  r'http://www\.kugou\.com/yy/singer/index/\d-([a-z]|null)-1\.html'])),
        # Singer detail pages are parsed for the actual data.
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/singer/home/\d+\.html']),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract singer name and song titles from a singer home page."""
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs

        yield item
items.py
# -*- coding: utf-8 -*-

# Item models for the kugoumusic project.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    """Container for one scraped singer and that singer's song titles."""
    # Singer display name (a single string).
    singer = scrapy.Field()
    # List of song-title strings collected from the singer's home page.
    songs = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Item pipeline: buffer scraped items and bulk-insert them into MongoDB.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class KugoumusicPipeline(object):
    """Buffers items and writes them to MongoDB in batches of 100."""

    BATCH_SIZE = 100  # flush the buffer once it reaches this many items

    def open_spider(self, spider):
        # NOTE(review): host/port are hard-coded; consider reading them
        # from spider.settings['MONGO_CONFIG'] instead (see settings.py).
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []

    def close_spider(self, spider):
        # Flush whatever is left in the buffer before disconnecting.
        self.insert()
        self.client.close()

    def insert(self):
        # Bug fix: insert_many raises InvalidOperation on an empty list,
        # which the original hit in close_spider; guard against it.
        if self.li:
            self.coll.insert_many(self.li)
            self.li = []

    def process_item(self, item, spider):
        # Bug fix: the original only appended in the else-branch, silently
        # dropping the item that triggered a flush (one lost item per 100).
        # Always buffer first, then flush when the batch is full.
        self.li.append(dict(item))
        if len(self.li) >= self.BATCH_SIZE:
            self.insert()
            print("成功插入100条数据-------------------------------------")
        return item
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the kugoumusic project.
#
# Only the settings this project overrides are active below; the full
# catalogue of options is documented at:
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MongoDB connection parameters could be centralised here and read by the
# pipeline, e.g.:
# MONGO_CONFIG = {'host': '127.0.0.1', 'port': 27017}

# Crawl without honouring robots.txt.
ROBOTSTXT_OBEY = False

# Send browser-like headers so the site serves normal pages.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Route every scraped item through the MongoDB pipeline.
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Further optional knobs left at their defaults (see the docs above):
# CONCURRENT_REQUESTS, DOWNLOAD_DELAY, COOKIES_ENABLED, AUTOTHROTTLE_*,
# HTTPCACHE_*, SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES, EXTENSIONS.
转载于:https://www.cnblogs.com/tttzqf/p/9638545.html
使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中相关推荐
- Python爬虫案例:爬取酷狗音乐全排行榜歌曲
前言 本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理 本次目标 爬取酷狗音乐全站排行榜歌曲 目标地址 https://www.ku ...
- Python爬取酷狗音乐歌手信息
前面我们说过用python爬取网易云音乐的歌手信息,Python爬取网易云音乐歌手信息 今天我们来爬取一下酷狗音乐的歌手信息(歌手id和歌手名),如果环境没有安装好,可以参照前面爬网易云环境配置作为参 ...
- scrapy_redis分布式爬取酷狗音乐
scrapy_redis分布式爬取酷狗音乐 前言 安装scrapy_redis 创建scrapy项目 spider模块 items模块.pipelines模块 setting.py 调试 运行 成果图 ...
- Python爬虫爬取酷狗音乐TOP500
Python大作业 内容简介: 用Python来爬取酷狗音乐TOP500的歌曲信息,统计这500首歌曲中出现的所有歌手,并做可视化处理生成词云 实验代码: import time import req ...
- Python爬虫入门——2. 2爬取酷狗音乐top1-500歌曲信息
有了第一个程序的基础,我们现在来爬取酷狗音乐top500的歌曲信息.连接http://www.kugou.com/yy/rank/home/1-8888.html 我们第一个程序只爬取了一个页面的数据 ...
- scrapy 爬取酷狗T500音乐
scrapy 爬取酷狗T500音乐 开始工作 代码的编写 开始工作 1.创建项目scrapy startproject kugouScrapy 2.创建spider cd kugou scrapy g ...
- Python爬取酷狗音乐-详解(多图预警)
目录 1.前言 2.分析一下 1. 2. 3. 3.代码解释 4.完整代码 5.结语 1.前言 前面发布了一篇关于QQ音乐爬取的教程,但对于我们这种文艺青年来说,一个平台的歌曲怎么够我们听的,也是因为 ...
- 爬虫训练(三):爬取酷狗音乐
今天趁机一鼓作气,把简单爬虫内容一次学习完毕,最后以爬取酷狗音乐排行榜歌曲作为结束,然后对此次学习做一个整理和总结.而且前两篇有些混乱,这里把内容做一次阶段性总结. 一.安装包 爬虫三大包:reque ...
- Python爬虫之爬取酷狗音乐歌曲
Python爬虫之爬取酷狗音乐歌曲 1.安装第三方库 在Python的语言库中, 分为Python标准库和Python的第三方库. Python标准库是在你安装Python的时候已经包含在了安装目录下 ...
最新文章
- 机器人操作系统ROS Indigo 入门学习(1)——安装ROS Indigo【转】
- C语言数组的一些运算*a,a+1,a+1,a+0
- 中国反渗透膜产业竞争现状与投资战略决策报告2021-2027年版
- Swing组件集合的事件处理(二)
- 从雷军那里反思,做什么样的公司?
- nginx.conf删除与否网页都能访问(nginx没有生效)的问题
- LeetCode:Remove Duplicates from Sorted List I II
- 前端很慌!React 称霸,Vue 凶猛,TypeScript 威逼 JavaScript
- 语音识别技术原理概述!
- 学习 尚硅谷-宋红康 Java基本语法(上): 变量与运算符
- ASP.NET MVC:WebViewPage.cs
- 通过Kali linux 模拟CC攻击进行WEB压力测试实战
- offer来了(原理篇)学习笔记-第9章设计模式
- linux dns配置srv记录,DNS之SRV记录
- cps评分和tps评分_一文总结:PD-1/PD-L1免疫检查点抑制剂和TPS、CPS、IPS
- U盘中毒后里面的数据怎样恢复
- 【论文阅读】Misshapen Pelvis Landmark Detection WithLocal-Global Feature Learning for DiagnosingDevelop
- 自定义 ViewGroup,实现多功能流式布局与网格布局
- 荣耀智慧屏鸿蒙远程控制电脑,荣耀智慧屏评测:鸿蒙OS加持 面向未来的超智能电视...
- 与声网面对面!声网Agora开发者吐槽大会招募中