scrapy爬取华为商城所有商品信息--科技快人一步

华为商城 https://www.vmall.com/index.html

目标:华为商城下的商品信息

按主页左侧导航的大分类（手机、笔记本&平板、智能穿戴……）逐个抓取
再进入每一个大分类下的小分类,抓取其中的商品:
- 商品标题
- 商品价格
规格参数
- 主要参数
- 主体
- ……
- 商品编码
将抓取结果写入 Excel
事先设置好 Excel 数据表的表头,便于后续分析数据

代码如下（scrapy）：

# -*- coding: utf-8 -*-
import html
import os
import re
import urllib.request
from copy import deepcopy

import scrapy
import xlrd
import xlwt
from ..items import HuaweiItem


class HuaWei(scrapy.Spider):
    """Crawl product data from the Huawei Vmall store.

    Flow of callbacks:

    * ``parse``    - home page: walks the main/sub category menu.
    * ``parse_A``  - category listing page: one request per product, plus
      the request for the next listing page.
    * ``parse_B``  - product detail page: fills in the remaining fields.
    * ``get_cu_1`` - auxiliary script referenced by the detail page: appends
      one more promotion entry and emits the finished item.

    Product codes already present in the first column of the workbook
    ``华为商城.xls`` are skipped, so a re-run only fetches new products.
    """

    name = 'huawei'
    allowed_domains = ['vmall.com', 'vmallres.com']
    start_urls = ['http://vmall.com/']

    # Workbook that stores the crawled products (first column: product code).
    _XLS_PATH = '华为商城.xls'
    # Header row written by new_xls(), in column order.
    _XLS_HEADERS = (
        '商品编码', '祖分类', '父分类', '标题', '图片', '链接',
        '价格', '评价数量', '内容', '说明', '服务说明', '促销',
    )
    # Matches either a hexadecimal character reference (group 1) or one of the
    # literal noise characters (";", newline, tab, space) that get_cx drops.
    _ENTITY_OR_NOISE = re.compile(r'(&#x[0-9a-fA-F]+;?)|[;\n\t ]')

    def parse(self, response):
        """Handle the home page and schedule one request per sub category.

        Yields a ``scrapy.Request`` for the first listing page of every sub
        category; the partially filled item travels in ``meta["item"]``.
        """
        self.new_xls()
        # Home page
        print("分割线-----------------------主页------------------------分割线")
        classify_list_A = response.xpath('//div[@id="category-block"]/div/ol/li')
        print("大分类长度:", len(classify_list_A))
        for category in classify_list_A:
            item = HuaweiItem()
            item['classify_A'] = category.xpath('.//input[2]/@value').extract_first()
            sub_categories = category.xpath('.//div[2]//li[not(@class="subcate-btn")]')
            for sub_category in sub_categories:
                item['classify_B'] = sub_category.xpath('.//input[1]/@value').extract_first()
                link = sub_category.xpath('.//a/@href').extract_first()
                if link is None:
                    # Without a link there is no listing page to request.
                    continue
                # "-1-3-0" is appended to the category URL; parse_A treats the
                # first of these numbers as the page counter.
                href = "https://www.vmall.com" + str(link) + '-1-3-0'
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    # Copy: the same item object is reused for every sub category.
                    meta={"item": deepcopy(item)},
                )
        rb = xlrd.open_workbook(self._XLS_PATH)
        rs = rb.sheet_by_index(0)
        # The first row is the header, hence the "- 1".
        print("已爬取的商品数量:", rs.nrows - 1)

    def parse_A(self, response):
        """Handle a category listing page.

        Yields a detail-page request for every product whose code is not yet
        in the workbook, followed by a request for the next listing page when
        the pager shows more pages than the current page number.
        """
        # Listing page
        print("分割线-----------------------中间页------------------------分割线")
        li_list = response.xpath(
            '//div[@class="layout"]/div[@class="channel-list"]'
            '/div[@class="pro-list clearfix"]/ul/li'
        )
        if not li_list:
            return
        print("正在爬取页面链接:", response.request.url)
        print("此页面商品数量:", len(li_list))

        base_item = response.meta["item"]
        # Read the already-stored product codes once per page instead of once
        # per product; a set makes the membership test O(1).
        rb = xlrd.open_workbook(self._XLS_PATH)
        rs = rb.sheet_by_index(0)
        cods = set(rs.col_values(0, start_rowx=0, end_rowx=None))

        for node in li_list:
            product = deepcopy(base_item)
            product['title'] = node.xpath('./div[1]/p[2]/a/span[1]/text()').extract_first()
            product['price'] = self._parse_price(
                node.xpath('./div[1]/p[3]/b/text()').extract_first()
            )
            product['comments'] = self._parse_comment_count(
                node.xpath('./div[1]/div[@class="p-button clearfix"]//label//text()').extract_first()
            )
            product['img'] = node.xpath('./div[1]/p[1]/a/img/@src').extract_first()
            link = node.xpath('./div[1]/p[1]/a/@href').extract_first()
            if link is None:
                # No detail page to visit for this entry.
                continue
            product['href'] = "https://www.vmall.com" + link
            onclick = node.xpath('./div[1]/p[1]/a/@onclick').extract_first() or ''
            # NOTE(review): this stays a list of matches here; parse_B replaces
            # it with a string only on the "product-property-recommand" layout.
            product['coding'] = re.findall('[(]\'(.*?)\'[)]', onclick)
            if product['coding'] and product['coding'][0] in cods:
                continue
            yield scrapy.Request(
                product['href'],
                callback=self.parse_B,
                meta={"item": product},
            )

        next_url_len = len(response.xpath('//ul[@id="page_ul"]/a'))
        url_parts = response.request.url.split("-")
        current_page = int(url_parts[2])
        if current_page < next_url_len:
            href = url_parts[0] + "-" + url_parts[1] + "-" + str(current_page + 1) + '-3-0'
            print("next_href:", href)
            yield scrapy.Request(
                href,
                callback=self.parse_A,
                meta={"item": deepcopy(base_item)},
            )

    def parse_B(self, response):
        """Handle a product detail page.

        Two page layouts are supported.  On the layout that has the
        ``product-property-recommand`` container the item is forwarded to
        ``get_cu_1`` through a request for the second ``namespace="ec"``
        script; otherwise the item is completed and yielded directly.
        """
        # Detail page
        print("分割线-----------------------详情页------------------------分割线")
        item = response.meta["item"]
        print("现在位置%s/%s" % (item["classify_A"], item["classify_B"]))
        print("正在爬取:", item['title'])
        content = response.xpath('//div[@id="product-property-recommand"]')
        if content:
            item['promotion'] = self.get_cx(response)
            coding = content.xpath(
                './div[@class="product-description clearfix"]/div[@class="fl"]/text()'
            ).extract_first()
            item['coding'] = (coding or '').strip()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            server_explain = content.xpath(
                './/div[@id="product-pulldown1"]/div[1]'
                '/div[@class="product-description-list clearfix"]/ul/li'
            )
            item['server_explain'] = self.get_cm(server_explain)
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            scripts = re.findall(r'<script src="(.*?)" namespace="ec"></script>', response.text)
            if len(scripts) > 1:
                yield scrapy.Request(
                    # urljoin keeps absolute URLs and resolves relative or
                    # scheme-less ones, which scrapy.Request would reject.
                    response.urljoin(scripts[1]),
                    callback=self.get_cu_1,
                    meta={"item": deepcopy(item)},
                    dont_filter=True,
                )
            else:
                # The expected script is missing: emit the item with what we
                # have instead of losing it to an IndexError.
                if not item['promotion']:
                    item['promotion'] = "暂无活动"
                yield item
        else:
            content = response.xpath('//div[@class="pro-meta-area"]')
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            item['server_explain'] = content.xpath('.//div[@class="pro-service"]/text()').extract_first()
            item['promotion'] = "暂无活动"
            yield item

    def get_cx(self, response):
        """Extract the promotion text embedded in the detail page's script.

        Returns a string in which the decoded promotion strings are joined
        alternately with ":" and ";" (``first:second;third:fourth;``), or an
        empty string when the page contains no promotion data.
        """
        print("获取促销")
        cu = re.findall(
            r'_groupPhotoList.push[(]{name:.*?}[)]; (_promotionsList+.*?); _prolongLst.push',
            response.text,
        )
        if not cu:
            return ""
        # Prefer the second match and fall back to the first when it is the
        # only one.
        segment = cu[1] if len(cu) > 1 else cu[0]
        cs = re.findall(r'"(.*?)"', segment)
        print(cu)
        print(len(cu))

        decoded = []
        for raw in cs:
            # Keep only strings that contain a hex character reference other
            # than "&#x2f" (the encoded "/"); everything else is discarded.
            if '&#x' not in raw.replace(r'&#x2f', "/"):
                continue
            # One pass: decode each reference, drop literal ";", newline, tab
            # and space.  html.unescape handles references of any length.
            decoded.append(
                self._ENTITY_OR_NOISE.sub(
                    lambda m: html.unescape(m.group(1)) if m.group(1) else '',
                    raw,
                )
            )

        pieces = []
        for position, text in enumerate(decoded, start=1):
            pieces.append(text)
            # Odd positions are followed by ":", even positions by ";".
            pieces.append(";" if position % 2 == 0 else ":")
        return "".join(pieces)

    def get_cu_1(self, response):
        """Append the promotion entry found in the auxiliary script and emit the item.

        The entry is built from two escaped strings in the script body and is
        appended as ``label:value;``.  When the expected snippet is not found
        the item is still yielded with the promotion text collected so far.
        """
        # Promotion: points granted for the purchase
        print("进入GET_CU_1")
        item = response.meta["item"]
        print(item)
        extra = ""
        snippets = re.findall(r' \\x3e\'[)],a.push[(](.*?")[)],', response.text)
        if snippets:
            labels = re.findall(r'\\x3e(.*?)\\x3c', snippets[0])
            values = re.findall(r'a.push[(]"(.*?)"', snippets[0])
            if labels and values:
                # The script stores its text as backslash escapes.
                cul_1 = labels[0].encode().decode('unicode-escape')
                cul_2 = values[0].encode().decode('unicode-escape')
                extra = cul_1 + ":" + cul_2 + ";"
        print("--------------------------str----------------------------------")
        item['promotion'] += extra
        if not item['promotion']:
            item['promotion'] = "暂无活动"
        yield item

    def new_xls(self):
        """Create the workbook with its header row if it does not exist yet."""
        if os.path.exists(self._XLS_PATH):
            return
        print("正在创建。。。")
        wb = xlwt.Workbook(encoding='utf-8')
        # The argument is the sheet name.
        ws = wb.add_sheet('商品数据')
        for column, title in enumerate(self._XLS_HEADERS):
            # write(row, column, label): row/column are zero-based.
            ws.write(0, column, label=title)
        wb.save(self._XLS_PATH)

    def get_cm(self, server_explain):
        """Build the service description from the given ``<li>`` selectors.

        For an ``<li>`` with several direct text nodes and a ``<span>`` child,
        the text nodes are concatenated with the span text inserted after the
        first one.  For an ``<li>`` with at most one text node, that text is
        appended followed by ";".

        Returns the concatenated description string.
        """
        cm = ""
        for li in server_explain:
            text = li.xpath('./text()')
            if len(text) > 1:
                span_text = li.xpath('./span/text()').extract_first()
                if span_text is not None:
                    # Use the extracted strings themselves, not the selector
                    # repr, whose data preview is abbreviated.
                    for position, fragment in enumerate(text.extract()):
                        cm += fragment
                        if position == 0:
                            cm += span_text
            else:
                first = text.extract_first()
                if first is not None:
                    cm += first + ';'
        return cm

    @staticmethod
    def _parse_price(text):
        """Return the number after "¥" rounded to 2 places, or 0 if absent or unparsable."""
        if not text or "¥" not in text:
            return 0
        try:
            return round(float(text.split("¥")[1]), 2)
        except ValueError:
            return 0

    @staticmethod
    def _parse_comment_count(text):
        """Return the integer before "人" in the comment label, or 0 if absent or unparsable."""
        if not text:
            return 0
        try:
            return int(text.split("人")[0])
        except ValueError:
            return 0

git地址

scrapy爬取华为商城所有商品信息--科技快人一步相关推荐

python+scrapy简单爬取淘宝商品信息
python结合scrapy爬取淘宝商品信息一.功能说明: 已实现功能: 通过scrapy接入selenium获取淘宝关键字搜索内容下的商品信息. 待扩展功能: 爬取商品中的全部其他商品信息. 二. ...
利用Selenium爬取淘宝商品信息
文章来源:公众号-智能化IT系统. 一. Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样. ...
爬取淘宝商品信息selenium+pyquery+mongodb
''' 爬取淘宝商品信息,通过selenium获得渲染后的源码,pyquery解析,mongodb存储 '''from selenium import webdriver from selenium. ...
Python爬虫自学之第（⑤）篇——爬取某宝商品信息
题外话: <Pi Network 免费挖矿国外热门项目一个π币大约值3元到10元>相信过去BTC的人,信不信未来的PI,了解一下,唯一一个高度与之持平的项目能看到这里说明快进入动态网页 ...
python爬虫scrapy爬取新闻标题及链接_python爬虫框架scrapy爬取梅花网资讯信息
原标题:python爬虫框架scrapy爬取梅花网资讯信息一.介绍本例子用scrapy-splash爬取梅花网(http://www.meihua.info/a/list/today)的资讯信息, ...
2021-11-16爬取淘宝商品信息时如何获取cookie
爬取淘宝商品信息时如何获取cookie ###一.基本环境 1.win10系统 2.火狐浏览器 3.编程软件anaconda 4.淘宝的robots:https://www.taobao.com/ro ...
Python高级特性与网络爬虫（二）：使用Selenium自动化测试工具爬取一号店商品信息
上一篇介绍了Ajax动态渲染的页面的分析和爬取,通过JavaScript动态渲染的页面的方式不只有ajax这一种,还有很多其他的方式,分析他们的网页结构和加密参数难度非常大,为了解决这样的页面的数据爬 ...
Scrapy爬取知乎用户信息以及人际拓扑关系
Scrapy爬取知乎用户信息以及人际拓扑关系 1.生成项目 scrapy提供一个工具来生成项目,生成的项目中预置了一些文件,用户需要在这些文件中添加自己的代码. 打开命令行,执行:scrapy sta ...
使用scrapy爬取斗鱼直播间信息
目录 1. 谷歌抓包工具的使用 1.1 打开Chrome开发者工具的方法 1.2 开发者工具的结构 1.3 network模块 2. 使用谷歌抓包工具抓取斗鱼数据 3. 使用scrapy爬取斗鱼直播间 ...

scrapy爬取华为商城所有商品信息--科技快人一步

华为商城 https://www.vmall.com/index.html

目标:华为商城下的商品信息

代码如下（scrapy）：

scrapy爬取华为商城所有商品信息--科技快人一步相关推荐

最新文章

热门文章