python 爬取苏宁易购商品信息和评论的详细流程

总体说明

爬取苏宁易购的最大难点在于它的页面中有很多内容是由 JS 动态加载的，寻找和构造链接比较麻烦。如果不做 JS 逆向分析，而是采用 selenium 等工具，爬取的效率会相对差一些。下面我会先放上我找到的 JS 动态加载的 url 截图，然后重点说明这些 url 的构造方式，最后附上完整代码，希望对您有所帮助。不足之处欢迎多提意见，谢谢！

主页面翻页url构造

# Excerpt from SNProcess.run: walk the first 50 search-result pages.
# The search keyword must be percent-encoded before it is placed in the URL path.
a = urllib.parse.quote("华为手机")
for i in range(50):
    # `cp` is the page parameter of the search URL; the loop sends 0..49.
    print("第%s页" % i)
    url = "https://search.suning.com/" + a + "/&iy=0&isNoResult=0&cp=" + str(i)
    html = self.get_html(url)
    self.get_phone_data(html)

点击下一页的时候寻找下一页的link，然后定位翻页的参数位置

详情页面数据获取

这边需要的数据是详情页面中的手机的价格，手机的评论统计，买家的回复内容

获取手机价格的url构造

# Fetch the price of one product; the price endpoint URL has to be assembled by hand.
def get_price_html(self, goods_src):
    """Return the product's ``netPrice`` as a string, or ``-1`` on any failure.

    Args:
        goods_src: Product link of the form ``.../<shop id>/<product id>.html``.

    Returns:
        The first ``"netPrice"`` value found in the price response, or the
        integer ``-1`` when the link cannot be parsed, the request fails, or
        the response carries no price (best-effort, never raises).
    """
    try:
        src_args = re.findall(r"com/(.*?)\.html", goods_src)[0]
        parts = src_args.split("/")
        shop_id = parts[0]
        # The endpoint uses an 18-character zero-padded product code; for an
        # 11-digit id this equals the former hard-coded "0000000" prefix.
        part_number = parts[-1].zfill(18)
        price_src = (
            "https://pas.suning.com/nspcsale_0_" + part_number + "_" + part_number
            + "_" + shop_id
            + "_250_029_0290199_20089_1000257_9254_12006_Z001___R1901001_0.5_0___000060864___.html?callback=pcData&_=1581050220963"
        )
        html = self.get_html(price_src)
        price_ret = re.findall(r'"netPrice":"(.*?)"', html, re.S)
        return price_ret[0]
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # the way the bare ``except:`` did.
        return -1

获取手机评论的统计数据的url

# Fetch the review statistics (counts per star rating) of one product.
def get_comment_num(self, clsid, goods_src):
    """Return the review counters of a product.

    Args:
        clsid: Cluster id of the product; when falsy the "general" review
            endpoint is queried instead of the "cluster" one.
        goods_src: Product link of the form ``.../<shop id>/<product id>.html``.

    Returns:
        A tuple ``(totalCount, picFlagCount, five+four stars, two+three stars,
        oneStarCount, againCount)``. The two summed entries are ``int``; the
        other four are the digit strings found in the response.

    Raises:
        IndexError: If the link cannot be parsed or a counter is missing.
    """
    src_args = re.findall(r"com/(.*?)\.html", goods_src)[0]
    # 18-character zero-padded product code; identical to the former
    # hard-coded "0000000" prefix for 11-digit ids.
    part_number = src_args.split("/")[-1].zfill(18)
    if clsid:
        url = ("https://review.suning.com/ajax/review_count/cluster-" + str(clsid)
               + "-" + part_number + "-0000000000-----satisfy.htm?callback=satisfy")
    else:
        url = ("http://review.suning.com/ajax/review_count/general--"
               + part_number + "-0000000000-----satisfy.htm?callback=satisfy")
    html = self.get_html(url)

    def count(field):
        # Match digits only so a counter that is the last key of its object
        # (followed by "}" instead of ",") is still captured cleanly.
        return re.findall(r'"%s":\s*(\d+)' % field, html)[0]

    oneStarCount = count("oneStarCount")
    twoStarCount = count("twoStarCount")
    # Each variable now reads its own field; the original had the three-star
    # and four-star patterns swapped, which corrupted both summed buckets.
    threeStarCount = count("threeStarCount")
    fourStarCount = count("fourStarCount")
    fiveStarCount = count("fiveStarCount")
    picFlagCount = count("picFlagCount")
    totalCount = count("totalCount")
    againCount = count("againCount")
    return (totalCount, picFlagCount,
            int(fiveStarCount) + int(fourStarCount),
            int(twoStarCount) + int(threeStarCount),
            oneStarCount, againCount)

获取评论信息的url

# Fetch the individual reviews of one product and store them row by row.
def get_comment_data(self, goods_src, clsid, num, redata, delay=1):
    """Download every review page of a product and write one row per review.

    Args:
        goods_src: Product link of the form ``.../<shop id>/<product id>.html``.
        clsid: Cluster id used in the review-list URL.
        num: Total number of reviews; determines how many pages are requested.
        redata: List of product-level columns prepended to every review row.
        delay: Seconds to sleep after each page (defaults to the original
            hard-coded 1 second).

    Each row is ``redata`` followed by content, publish time, nick name,
    colour, edition and label names, and is handed to ``self.write_data``.
    """
    src_args = re.findall(r"com/(.*?)\.html", goods_src)[0]
    # 18-character zero-padded product code; identical to the former
    # hard-coded "0000000" prefix for 11-digit ids.
    part_number = src_args.split("/")[-1].zfill(18)
    page_size = 10  # matches the "-10-" segment of the URL below
    # Ceiling division so the last, partially filled page is requested too.
    # The original ``range(1, int(num/10))`` skipped it and fetched nothing
    # at all for products with fewer than 20 reviews.
    page_count = -(-int(num) // page_size)
    for page in range(1, page_count + 1):
        url = ("http://review.suning.com/ajax/cluster_review_lists/cluster-" + str(clsid)
               + "-" + part_number + "-0000000000-total-" + str(page)
               + "-default-10-----reviewList.htm?callback=reviewList")
        html = self.get_html(url)
        content = re.findall(r'"content":"(.*?)","publishTime', html)
        publishTime = re.findall(r'"publishTime":"(.*?)",.*?sourceSystem"', html)
        username = re.findall(r'"nickName":"(.*?)"', html)
        color = re.findall(r'"charaterDesc1":"(.*?)"', html)
        edition = re.findall(r'charaterDesc2":"(.*?)"', html)
        labelnames = re.findall(r'"labelNames":(.*?),"score"', html)
        if not content:
            # An empty page means the server has no more reviews to offer.
            break
        # NOTE(review): the six lists are matched independently, so a review
        # lacking one field would shift the columns of the following rows.
        for val in zip(content, publishTime, username, color, edition, labelnames):
            result = redata + list(val)
            print(len(result), result)
            # Persist the row to the CSV file.
            self.write_data(result)
        time.sleep(delay)

完整的代码如下

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import urllib
import re
import csv


class SNProcess():
    """Crawl Suning search results for "华为手机" and append every review to a CSV.

    Instantiating the class starts the crawl immediately (``__init__`` calls
    ``run``). For each product on each search page the price, title, the first
    three parameter fields and the review statistics are collected; then every
    review is written as one row to ``retdata.csv``.
    """

    # Seconds to wait for a single HTTP response before giving up, so that one
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10
    # Reviews per page of the review-list endpoint ("-10-" segment of its URL).
    REVIEWS_PER_PAGE = 10

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.run()

    def get_html(self, url):
        """Return the body of ``url`` as text, sending the browser User-Agent."""
        res = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return res.text

    def write_data(self, data):
        """Append ``data`` (one sequence of columns) as a row to ``retdata.csv``."""
        with open("retdata.csv", "a+", encoding="utf-8", errors='ignore', newline="") as f:
            f_csv = csv.writer(f)
            f_csv.writerow(data)

    @staticmethod
    def _product_keys(goods_src):
        """Split a product link into ``(shop id, 18-character product code)``.

        Raises IndexError when ``goods_src`` is not a ``.../<a>/<b>.html`` link.
        """
        src_args = re.findall(r"com/(.*?)\.html", goods_src)[0]
        parts = src_args.split("/")
        # The endpoints use an 18-character zero-padded product code; for an
        # 11-digit id this equals the former hard-coded "0000000" prefix.
        return parts[0], parts[-1].zfill(18)

    @staticmethod
    def _review_count(html, field):
        """Return the digit string stored under ``field`` in a review-count response."""
        # Digits only, so a counter that is the last key of its object
        # (followed by "}" rather than ",") is still captured cleanly.
        return re.findall(r'"%s":\s*(\d+)' % field, html)[0]

    # Fetch the review statistics (counts per star rating) of one product.
    def get_comment_num(self, clsid, goods_src):
        """Return the review counters of a product.

        Args:
            clsid: Cluster id; when falsy the "general" endpoint is queried.
            goods_src: Product link of the form ``.../<shop id>/<product id>.html``.

        Returns:
            ``(totalCount, picFlagCount, five+four stars, two+three stars,
            oneStarCount, againCount)``. The two sums are ``int``; the other
            entries are digit strings taken from the response.

        Raises:
            IndexError: If the link cannot be parsed or a counter is missing.
        """
        _, part_number = self._product_keys(goods_src)
        if clsid:
            url = ("https://review.suning.com/ajax/review_count/cluster-" + str(clsid)
                   + "-" + part_number + "-0000000000-----satisfy.htm?callback=satisfy")
        else:
            url = ("http://review.suning.com/ajax/review_count/general--"
                   + part_number + "-0000000000-----satisfy.htm?callback=satisfy")
        html = self.get_html(url)
        oneStarCount = self._review_count(html, "oneStarCount")
        twoStarCount = self._review_count(html, "twoStarCount")
        # Each variable reads its own field; the original had the three-star
        # and four-star patterns swapped, which corrupted both summed buckets.
        threeStarCount = self._review_count(html, "threeStarCount")
        fourStarCount = self._review_count(html, "fourStarCount")
        fiveStarCount = self._review_count(html, "fiveStarCount")
        picFlagCount = self._review_count(html, "picFlagCount")
        totalCount = self._review_count(html, "totalCount")
        againCount = self._review_count(html, "againCount")
        return (totalCount, picFlagCount,
                int(fiveStarCount) + int(fourStarCount),
                int(twoStarCount) + int(threeStarCount),
                oneStarCount, againCount)

    # Fetch the product page. Extracting the clusterId here is essential: both
    # the review-count URL and the review-list URL need it as a parameter.
    def get_goods_title(self, url):
        """Return ``(clusterId, title, args0, args1, args2)`` for a product page.

        Args:
            url: Protocol-relative product link; ``"https:"`` is prepended.

        The three ``args`` are the texts of the first three ``dd.r-info``
        elements, or ``"无参数"`` for all three when fewer than three exist.

        Raises:
            IndexError: If the page has no ``<title>`` or no ``clusterId``.
        """
        html = self.get_html("https:" + url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all('title')[0].get_text()
        clusterId_ret = re.findall(r'"clusterId":"(.*?)"', html, re.S)
        infos = soup.find_all("dd", attrs={"class": "r-info"})
        if len(infos) >= 3:
            args0, args1, args2 = (dd.get_text() for dd in infos[:3])
        else:
            args0, args1, args2 = ["无参数"] * 3
        return clusterId_ret[0], title, args0, args1, args2

    # Fetch the price of one product; the price endpoint URL is assembled by hand.
    def get_price_html(self, goods_src):
        """Return the product's ``netPrice`` string, or ``-1`` on any failure."""
        try:
            shop_id, part_number = self._product_keys(goods_src)
            price_src = (
                "https://pas.suning.com/nspcsale_0_" + part_number + "_" + part_number
                + "_" + shop_id
                + "_250_029_0290199_20089_1000257_9254_12006_Z001___R1901001_0.5_0___000060864___.html?callback=pcData&_=1581050220963"
            )
            html = self.get_html(price_src)
            price_ret = re.findall(r'"netPrice":"(.*?)"', html, re.S)
            return price_ret[0]
        except Exception:
            # Still best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit like the bare ``except:`` did.
            return -1

    # Fetch the individual reviews of one product and store them row by row.
    def get_comment_data(self, goods_src, clsid, num, redata, delay=1):
        """Download every review page of a product and write one row per review.

        Args:
            goods_src: Product link of the form ``.../<shop id>/<product id>.html``.
            clsid: Cluster id used in the review-list URL.
            num: Total number of reviews; determines how many pages are requested.
            redata: List of product-level columns prepended to every review row.
            delay: Seconds to sleep after each page (defaults to the original
                hard-coded 1 second).
        """
        _, part_number = self._product_keys(goods_src)
        # Ceiling division so the last, partially filled page is requested too.
        # The original ``range(1, int(num/10))`` skipped it and fetched nothing
        # at all for products with fewer than 20 reviews.
        page_count = -(-int(num) // self.REVIEWS_PER_PAGE)
        for page in range(1, page_count + 1):
            url = ("http://review.suning.com/ajax/cluster_review_lists/cluster-" + str(clsid)
                   + "-" + part_number + "-0000000000-total-" + str(page)
                   + "-default-10-----reviewList.htm?callback=reviewList")
            html = self.get_html(url)
            content = re.findall(r'"content":"(.*?)","publishTime', html)
            publishTime = re.findall(r'"publishTime":"(.*?)",.*?sourceSystem"', html)
            username = re.findall(r'"nickName":"(.*?)"', html)
            color = re.findall(r'"charaterDesc1":"(.*?)"', html)
            edition = re.findall(r'charaterDesc2":"(.*?)"', html)
            labelnames = re.findall(r'"labelNames":(.*?),"score"', html)
            if not content:
                # An empty page means the server has no more reviews to offer.
                break
            # NOTE(review): the six lists are matched independently, so a
            # review lacking one field would shift the following rows.
            for val in zip(content, publishTime, username, color, edition, labelnames):
                result = redata + list(val)
                print(len(result), result)
                # Persist the row to the CSV file.
                self.write_data(result)
            time.sleep(delay)

    # Parse one search-result page; this is the function that drives the crawl.
    def get_phone_data(self, html):
        """Process every product listed on one search-result page.

        A product whose data cannot be collected is reported with "数据异常"
        and skipped; the remaining products are still processed.
        """
        soup = BeautifulSoup(html, 'lxml')
        result_lists = soup.find_all('ul', attrs={'class': 'general clearfix'})
        if not result_lists:
            # No result list on this page (e.g. past the last page): skip the
            # page instead of crashing the whole run with an IndexError.
            print("数据异常")
            return
        for item in result_lists[0].find_all("li"):
            try:
                src = item.find_all("a", attrs={"target": "_blank"})[0].get("href")
                print(src)
                comment_num = item.find_all("div", attrs={"class": "info-evaluate"})[0].find_all("a")[0].get_text()
                is_self_support = item.find_all("div", attrs={"class": "store-stock"})[0].find_all("a")[0].get_text()
                price = self.get_price_html(src)
                clusterId, title, args0, args1, args2 = self.get_goods_title(src)
                counts = self.get_comment_num(clusterId, src)
                ret_data = [title, comment_num, price, is_self_support,
                            args0, args1, args2, *counts]
                # counts[0] is totalCount, which fixes the number of review pages.
                self.get_comment_data(src, clusterId, int(counts[0]), ret_data)
            except Exception as exc:
                # Report the cause, but let Ctrl-C / SystemExit stop the crawl.
                print("数据异常", repr(exc))
                continue

    def run(self):
        """Walk the first 50 search-result pages for the keyword "华为手机"."""
        # Imported here so the method does not rely on another package having
        # loaded the ``urllib.parse`` submodule as a side effect.
        from urllib.parse import quote

        a = quote("华为手机")
        for i in range(50):
            print("第%s页" % i)
            url = "https://search.suning.com/" + a + "/&iy=0&isNoResult=0&cp=" + str(i)
            html = self.get_html(url)
            self.get_phone_data(html)


if __name__ == "__main__":
    SNProcess()

python 爬取苏宁易购商品信息和评论的详细流程相关推荐

scrapy深入爬取苏宁易购图书信息
深入爬取图书信息,大分类到小分类再到详细信息页 (从中午放学一直做到了晚上18点,无线火力都不玩了) 主要代码如下 # -*- coding: utf-8 -*- import scrapy from ...
使用scrapy爬取苏宁易购图书信息
理论基础详见:https://blog.csdn.net/apollo_miracle/article/details/84987459 # -*- coding: utf-8 -*- import ...
Python爬虫20-Scrapy爬取苏宁易购图书
Scrapy爬取苏宁易购图书 1.创建一个scrapy项目 scrapy startproject book 2.生成一个爬虫文件 scrapy genspider su book.suning.co ...
python爬取并分析淘宝商品信息
python爬取并分析淘宝商品信息背景介绍一.模拟登陆二.爬取商品信息 1. 定义相关参数 2. 分析并定义正则 3. 数据爬取三.简单数据分析 1.导入库 2.中文显示 3.读取数据 4.分 ...
爬虫项目十：Python苏宁易购商品数据、评论数据爬取
文章目录前言一.商品数据 1.分析url 2.解析数据 3.实现翻页二.评论数据前言利用Python对苏宁易购商品数据评价数据实现爬取提示:以下是本篇文章正文内容,下面案例可供参考一.商 ...
Python爬虫——实战三：爬取苏宁易购的商品价格(渲染引擎方法)
苏宁易购的商品价格请求URL为 https://pas.suning.com/nspcsale_0_000000000152709847_000000000152709847_0000000000_1 ...
基于python苏宁易购商品信息爬取
本文思路来源崔庆才老师的淘宝商品爬取 -首先打开苏宁易购网站,找到搜索框以及搜索按钮接口,模拟人工操作,输入关键词,并进行点击搜索操作,进入到商品的详细页. from selenium import ...
python秒杀神器苏宁_Python爬虫——实战三：爬取苏宁易购的商品价格
苏宁易购的商品价格请求URL为 https://pas.suning.com/nspcsale_0_000000000152709847_000000000152709847_0000000000_1 ...
爬虫：Scrapy分类爬取苏宁易购书城
目录 1.scrapy框架概述 2.Scrapy爬取苏宁图书案例 3.设置配置文件 4.如何获取USER_AGENT 5.编写items.py文件 6.编写爬虫suning.py程序 1.s ...