Scrapy 爬虫：爬取美食杰（meishij.net）

准备资料：

创建 Scrapy 项目的具体步骤这里不再赘述；需要提前准备好 MongoDB。爬虫的代码结构如下：

  1 # -*- coding: utf-8 -*-
  2 import requests
  3 import scrapy
  4 from pyquery import PyQuery
  5 from ..items import FoodItem
  6 from ..utils.parse import parse, Home_cooking, othe_cooking, Dishes, Dishes_Details
  7 from scrapy.http import Request
  8 from traceback import format_exc
  9
 10 from pymongo import MongoClient
 11
 12
 13 class FoodTestSpider(scrapy.Spider):
 14     name = 'Food_test'
 15     allowed_domains = ['meishij.net']
 16     start_urls = ['https://www.meishij.net/chufang/diy/']
 17
 18     client = MongoClient()
 19     db = client.test  # 连接test数据库，没有则自动创建
 20     my_set = db.meishi  # 使用set集合，没有则自动创建
 21     result = {"": {"": []}}
 22
 23     def parse(self, response):
 24         url_list = parse(response)
 25         for url in url_list:
 26             item = Request(url_list[url],
 27                            callback=self.jiachagncai,
 28                            meta={"url": url},
 29                            errback=self.error_back, )
 30
 31             yield item
 32
 33             # with open('meishi.json', 'w', encoding='utf-8') as f:
 34             #     f.write(self.result)
 35
 36     def jiachagncai(self, response):
 37         # requests.post().url
 38
 39         if response.url == 'https://www.meishij.net/chufang/diy/':
 40             home = Home_cooking(response)
 41             # print(home)
 42         else:
 43             home = othe_cooking(response)
 44         for url in home:
 45             # print(url)
 46             name = response.meta["url"]
 47             # name_= self.result.get(name,{})
 48             # print(name_)
 49             # name__ = name_.get(name,{}).get(url,[])
 50             # name__.append(url)
 51             # name_[url] = name__
 52             # _name = {name:name_}
 53             # self.result.update(_name)
 54             # print(name_)
 55             # print(type(self.result.get(name)))
 56             yield Request(home[url],
 57                           callback=self.shangping_list,
 58                           meta={"temp": {"first": name, "se": url}},
 59                           errback=self.error_back, )
 60
 61     def shangping_list(self, response):
 62         date = Dishes(response)
 63         for url in date:
 64             yield Request(url,
 65                           callback=self.xiangqingye,
 66                           meta=response.meta['temp'],
 67                           errback=self.error_back, )
 68         url = PyQuery(response.text)('#listtyle1_w > div.listtyle1_page > div > a.next').attr('href')
 69         if url:
 70             yield Request(url,
 71                           callback=self.shangping_list,
 72                           meta={"temp": response.meta['temp']},
 73                           errback=self.error_back, )
 74
 75     def xiangqingye(self, response):
 76         first = response.meta['first']
 77         se = response.meta['se']
 78
 79         print('---------------------------', first,se)
 80         temp = Dishes_Details(response)
 81         self.my_set.insert({first:{se: temp}})
 82
 83         # if first == '家常菜谱':
 84         #     print('---------------------------', first)
 85         #     print('===========================', se)
 86         #     self.my_set.insert({'jiachang': {se: temp}})
 87         # elif first == '中华菜系':
 88         #     print('---------------------------', first)
 89         #     print('===========================', se)
 90         #     self.my_set.insert({'zhonghua': {se: temp}})
 91         # elif first == '各地小吃':
 92         #     print('---------------------------', first)
 93         #     print('===========================', se)
 94         #     self.my_set.insert({'gedi': {se: temp}})
 95         # elif first == '外国菜谱':
 96         #     print('---------------------------', first)
 97         #     print('===========================', se)
 98         #     self.my_set.insert({'waiguo': {se: temp}})
 99         # elif first == '烘焙':
100         #     print('---------------------------', first)
101         #     print('===========================', se)
102         #     self.my_set.insert({'hongbei': {se: temp}})
103         # else:
104         #     pass
105
106     def error_back(self, e):
107         _ = e
108         self.logger.error(format_exc())

以上是 Food_test 爬虫（spider）的代码。

  1 __author__ = 'chenjianguo'
  2 # -*- coding:utf-8 -*-
  3
  4 from pyquery import PyQuery
  5 import re
  6
  7
  8 def parse(response):
  9     """
 10     抓取美食tab 列表： https://www.meishij.net/chufang/diy/
 11     返回列 大 tab 信息
 12     :param:response
 13     :return
 14     """
 15     jpy = PyQuery(response.text)
 16
 17     tr_list = jpy('#listnav_ul > li').items()
 18
 19     result = dict()  #result为set集合（不允许重复元素）
 20     for tr in tr_list:
 21
 22         url = tr('a').attr('href')  #爬取美食tab的url
 23         text = tr('a').text()
 24         if url and 'https://www.meishij.net' not in url:
 25             url = 'https://www.meishij.net' + url
 26         if url and 'shicai' not in url and 'pengren' not in url:
 27             result[text]=url
 28     return result
 29
 30 def Home_cooking(response):
 31     '''
 32     家常菜的小tab列表 家常菜的页面元素与其他大tab 不一样需要特殊处理 https://www.meishij.net/chufang/diy/
 33     返回小tab 列表信息
 34     :param response:
 35     :return:
 36     '''
 37     jpy = PyQuery(response.text)
 38     tr_list = jpy('#listnav_con_c > dl.listnav_dl_style1.w990.bb1.clearfix > dd').items()
 39     result = dict()  # result为set集合（不允许重复元素）
 40     for tr in tr_list:
 41         url = tr('a').attr('href')  #爬取家常菜小 tab的url
 42         text = tr('a').text()
 43         result[text] = url
 44     return result
 45
 46 def othe_cooking(response):
 47     '''
 48     其他菜的小tab列表  https://www.meishij.net/china-food/caixi/
 49     返回小tab 列表信息
 50     :param response:
 51     :return:
 52     '''
 53     jpy = PyQuery(response.text)
 54     tr_list = jpy('#listnav > div > dl > dd').items()
 55     result = dict()  # result为set集合（不允许重复元素）
 56     for tr in tr_list:
 57         url = tr('a').attr('href')  # 爬取家常菜小 tab的url
 58         text = tr('a').text()
 59         result[text] = url
 60     return result
 61
 62
 63 def Dishes(response):
 64     '''
 65     菜品列表  https://www.meishij.net/chufang/diy/jiangchangcaipu/
 66     返回菜品信息
 67     :param response:
 68     :return:
 69     '''
 70     jpy = PyQuery(response.text)
 71     tr_list = jpy('#listtyle1_list > div').items()
 72     result = set()  # result为set集合（不允许重复元素）
 73     for tr in tr_list:
 74         url = tr('a').attr('href')  #爬取菜品的url
 75         result.add(url)
 76     # print(result,len(result))
 77     return result
 78
 79 def Dishes_Details(response):
 80     '''
 81     菜品的详细信息 https://www.meishij.net/zuofa/nanguaputaoganfagao_2.html
 82     返回 主要就是菜名、图片、用料、做法
 83     :param response:
 84     :return:
 85     '''
 86
 87     jpy = PyQuery(response.text)
 88     result = {'用料':{},'统计':{},'做法':{}}
 89
 90     result['主图'] =jpy('body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_headerimg_w > img').attr('src')
 91     result['菜名']=jpy('#tongji_title').text()
 92
 93     tongji = jpy('body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info2 > ul > li').items()
 94     for i in tongji:
 95         result['统计'][i('strong').text()]=i('a').text()
 96
 97     Material = jpy('body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.zl.clearfix > ul').items()
 98     temp,tag = '',''
 99     for i in Material:
100         temp =(i('li > div > h4 > a').text()).replace(' ','#').split('#')
101         tag = (i('li > div > h4 > span').text()).replace(' ','#').split('#')
102     for k,v in enumerate(temp):
103         result['用料'][v]=tag[k]
104     k = jpy('body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.fuliao.clearfix > ul > li > h4 > a').text()
105     v = jpy('body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.fuliao.clearfix > ul > li > span').text()
106     result['用料'][k]=[v]
107
108     #Practice = jpy('div.measure > div > p').items() or jpy('div.measure > div > div > em').items()
109
110     Practice = jpy("em.step").items()
111     text =[]
112     count =1
113     for i in Practice:
114         if i.parent().is_("div"):
115             text = i.text() + i.parent()("p").text()
116             img = (i.parent()('img').attr('src'))
117             # result['做法'][text] = img
118             result['做法']['step_'+str(count)] = [text,img]
119             count +=1
120
121         elif i.parent().is_("p"):
122             text =i.parent()("p").text()
123             img =(i.parent().parent()('p')('img').attr('src'))
124             # result['做法'][text] = img
125             result['做法']['step_' + str(count)] = [text, img]
126             count += 1
127         else:
128             pass
129     # print(result, len(result))
130     return result
131
132
def fanye(response):
    """Return the href of the "next page" link on a dish list page.

    :param response: object whose ``text`` attribute holds the page HTML
    :return: whatever ``attr('href')`` yields for the ``a.next`` pager link
    """
    document = PyQuery(response.text)
    next_link = document('#listtyle1_w > div.listtyle1_page > div > a.next')
    return next_link.attr('href')
139
140
if __name__ == '__main__':
    # Manual smoke test: fetch one list page and print its next-page link.
    import requests

    # Detail pages that can be fed to Dishes_Details() instead:
    #   https://www.meishij.net/zuofa/youmenchunsun_15.html
    #   https://www.meishij.net/zuofa/nanguaputaoganfagao_2.html
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get('https://www.meishij.net/chufang/diy/?&page=56', timeout=10)
    tag = fanye(r)
    print(tag)

以上是解析模块 parse（utils/parse.py）的代码。

我这里没有用到中间件和管道，而是在爬虫里直接把数据写入 MongoDB，并按“大类 → 小类”对数据做了分类。

结果

转载于:https://www.cnblogs.com/Baylor-Chen/p/9163903.html

scrapy 爬虫-爬美食节相关推荐

Python Scrapy 爬虫 - 爬取多级别的页面
Python Scrapy 爬虫 - 爬取多级别的页面：互联网中众多的 scrapy 教程模板，都是爬取“下一页 → 下一页”形式的，很少有“父级 → ...
Scrapy爬虫爬取电影天堂
Scrapy CrawlSpider爬取目标网址:http://www.dytt8.net 创建项目:scrapy startproject <爬虫项目文件的名字> 生成 CrawlSp ...
使用scrapy爬虫,爬取17k小说网的案例-方法一
无意间看到17小说网里面有一些小说小故事,于是决定用爬虫爬取下来自己看着玩,下图这个页面就是要爬取的来源. a 这个页面一共有125个标题,每个标题里面对应一个内容,如下图所示下面直接看最核心spi ...
Scrapy爬虫爬取豆瓣TOP250
文章目录分析网页创建Scrapy爬虫框架修改spider脚本修改items脚本修改settings脚本运行使用Scrapy爬虫框架爬取豆瓣电影TOP250 分析网页第一页 start= ...
在anaconda下创建我的第一个scrapy爬虫——爬取dmoz网站某一网址下的目录的链接名称以及链接地址...
这里我用的python工具是anaconda. 1.首先创建一个scrapy工程: 打开anaconda promt命令行(注意这里不是使用cmd打开windows下的命令行),进入到需要创建工程的目 ...
php 爬虫超市,scrapy爬虫爬取天猫进口零食网页
出于某些原因,想调戏下天猫的反爬虫机制,于是就有了这篇记录源码已传osgit ,感兴趣可以戳下正文开始分析目标(items) 解析路径(xpath) 目标为天猫超市的进口商品区研究一下待抓取网 ...
Python scrapy爬虫爬取伯乐在线全部文章，并写入数据库
伯乐在线爬虫项目目的及项目准备: 1.使用scrapy创建项目 2.创建爬虫,bole 域名 jobbole.com 3.Start_urls = ['http://blog.jobbole.com/ ...
使用scrapy爬虫,爬取17k小说网的案例-方法二
楼主准备爬取此页面的小说,此页面一共有125章我们点击进去第一章和第一百二十五章发现了一个规律我们看到此链接的 http://www.17k.com/chapter/271047/6336386 ...
福利向---Scrapy爬虫爬取多级图片网站
1.目标站分析目标站网址为https://52zfl.vip/zhaifuli/list_2_1.html 每页网址有若干链接,点击每个链接,是每部图片资源的详情页面,由于图片数量较多,涉及到翻页操 ...

scrapy 爬虫-爬美食节

scrapy 爬虫-爬美食节相关推荐

最新文章

热门文章