分析淘宝网

这次选择的是淘宝网热卖而不是淘宝网。二者虽然名字不同,但数据是一样的,区别在于前者是一个导购网站:它把后者所有店铺和商品的海量数据按照销售量、好评度、信誉度进行综合测评、重新计算、重新排序后展现给买家。

找到准确数据:

请求参数对比:

jsv: 2.4.0
appKey: 12574478
t: 1597626852590
sign: 457c09a81147eb493d1f58ad6ab64d52
api: mtop.alimama.union.sem.landing.pc.items
v: 1.0
AntiCreep: true
dataType: jsonp
type: jsonp
ecode: 0
callback: mtopjsonp1
data: {"keyword":"女装","ppath":"","loc":"","minPrice":"","maxPrice":"","ismall":"","ship":"","itemAssurance":"","exch

jsv: 2.4.0

appKey: 12574478

t: 1597626856031

sign: 4a87f4308903ded469a1328275ded458

api: mtop.alimama.union.sem.landing.pc.items

v: 1.0

AntiCreep: true

dataType: jsonp

type: jsonp

ecode: 0

callback: mtopjsonp1

data: {"keyword":"女装","ppath":"","loc":"","minPrice":"","maxPrice":"","ismall":"","ship":"","itemAssurance":"","exchange7":"","custAssurance":"","b":"","clk1":"91a160066daf9f8b69646bd610341da7","pvoff":"","pageSize":"100","page":"","elemtid":"1","refpid":"mm_26632258_3504122_32538762","pid":"430673_1006","featureNames":"spGoldMedal,dsrDescribe,dsrDescribeGap,dsrService,dsrServiceGap,dsrDeliver, dsrDeliverGap","ac":"5TGcF6nyjHkCAXt4MaK1QH5c","wangwangid":"","catId":""}

对比发现,请求参数中发生变化的只有t、sign以及data中的page和ac,其余均未改变。t参数一眼就能看出是时间戳,sign参数则需要通过调试和阅读源码进行逆向分析。

解决t和sign参数

发现sign和t以及其他参数都在这里面构造的。

所以我们在这里下断点调试这些参数,得出sign参数的构造过程。可以发现t就是当前时间戳(13位,毫秒),sign由h(d.token + "&" + i + "&" + g + "&" + c.data)计算得到,其中:d.token是this.options.token,也就是cookie中_m_h5_tk参数值以"_"分隔的前半部分;i是时间戳;g是appKey,即12574478;c.data就是this.params.data。

this.options.token = e4a0bbef377f5792e1852f92581f769c(参考值)

this.params.data = {"keyword":"女装","ppath":"","loc":"","minPrice":"","maxPrice":"","ismall":"","ship":"","itemAssurance":"","exchange7":"","custAssurance":"","b":"","clk1":"91a160066daf9f8b69646bd610341da7","pvoff":"","pageSize":"100","page":"","elemtid":"1","refpid":"mm_26632258_3504122_32538762","pid":"430673_1006","featureNames":"spGoldMedal,dsrDescribe,dsrDescribeGap,dsrService,dsrServiceGap,dsrDeliver, dsrDeliverGap","ac":"5TGcF6nyjHkCAXt4MaK1QH5c","wangwangid":"","catId":""}

h函数如下所示。

/**
 * Hashes a string and returns the digest as 32 hexadecimal characters.
 * The structure is the MD5 algorithm: four 32-bit state words (t, u, v, w)
 * initialised to the MD5 constants, updated by 64 steps per 16-word block.
 *
 * Inner helpers (note that the inner `h` shadows this function's own name):
 *  - b: rotates a 32-bit value left by `b` bits.
 *  - c: adds two values modulo 2^32, handling the top two bits by hand.
 *  - d, e, f, g: the four bitwise mixing functions, one per group of 16 steps.
 *  - h, i, j, k: perform one step using d, e, f and g respectively
 *    (mix, add message word and constant, rotate, add).
 *  - l: packs the char codes into little-endian 32-bit words, appends the
 *    0x80 padding byte and stores the bit length in the last two words.
 *  - m: formats a 32-bit word as 8 hex digits, least-significant byte first.
 *  - n: replaces "\r\n" with "\n", then UTF-8-encodes each UTF-16 code unit
 *    separately (every code unit >= 2048, surrogates included, becomes 3 bytes).
 *
 * Because of `n`, input containing "\r\n" hashes as if it contained "\n".
 *
 * @param {string} a - Text to hash.
 * @returns {string} Lowercase hex digest.
 */
function h(a) {function b(a, b) {return a << b | a >>> 32 - b}function c(a, b) {var c, d, e, f, g;return e = 2147483648 & a,f = 2147483648 & b,c = 1073741824 & a,d = 1073741824 & b,g = (1073741823 & a) + (1073741823 & b),c & d ? 2147483648 ^ g ^ e ^ f : c | d ? 1073741824 & g ? 3221225472 ^ g ^ e ^ f : 1073741824 ^ g ^ e ^ f : g ^ e ^ f}function d(a, b, c) {return a & b | ~a & c}function e(a, b, c) {return a & c | b & ~c}function f(a, b, c) {return a ^ b ^ c}function g(a, b, c) {return b ^ (a | ~c)}function h(a, e, f, g, h, i, j) {return a = c(a, c(c(d(e, f, g), h), j)),c(b(a, i), e)}function i(a, d, f, g, h, i, j) {return a = c(a, c(c(e(d, f, g), h), j)),c(b(a, i), d)}function j(a, d, e, g, h, i, j) {return a = c(a, c(c(f(d, e, g), h), j)),c(b(a, i), d)}function k(a, d, e, f, h, i, j) {return a = c(a, c(c(g(d, e, f), h), j)),c(b(a, i), d)}function l(a) {for (var b, c = a.length, d = c + 8, e = (d - d % 64) / 64, f = 16 * (e + 1), g = new Array(f - 1), h = 0, i = 0; c > i; )b = (i - i % 4) / 4,h = i % 4 * 8,g[b] = g[b] | a.charCodeAt(i) << h,i++;return b = (i - i % 4) / 4,h = i % 4 * 8,g[b] = g[b] | 128 << h,g[f - 2] = c << 3,g[f - 1] = c >>> 29,g}function m(a) {var b, c, d = "", e = "";for (c = 0; 3 >= c; c++)b = a >>> 8 * c & 255,e = "0" + b.toString(16),d += e.substr(e.length - 2, 2);return d}function n(a) {a = a.replace(/\r\n/g, "\n");for (var b = "", c = 0; c < a.length; c++) {var d = a.charCodeAt(c);128 > d ? b += String.fromCharCode(d) : d > 127 && 2048 > d ? 
(b += String.fromCharCode(d >> 6 | 192),b += String.fromCharCode(63 & d | 128)) : (b += String.fromCharCode(d >> 12 | 224),b += String.fromCharCode(d >> 6 & 63 | 128),b += String.fromCharCode(63 & d | 128))}return b}var o, p, q, r, s, t, u, v, w, x = [], y = 7, z = 12, A = 17, B = 22, C = 5, D = 9, E = 14, F = 20, G = 4, H = 11, I = 16, J = 23, K = 6, L = 10, M = 15, N = 21;for (a = n(a),x = l(a),t = 1732584193,u = 4023233417,v = 2562383102,w = 271733878,o = 0; o < x.length; o += 16)p = t,q = u,r = v,s = w,t = h(t, u, v, w, x[o + 0], y, 3614090360),w = h(w, t, u, v, x[o + 1], z, 3905402710),v = h(v, w, t, u, x[o + 2], A, 606105819),u = h(u, v, w, t, x[o + 3], B, 3250441966),t = h(t, u, v, w, x[o + 4], y, 4118548399),w = h(w, t, u, v, x[o + 5], z, 1200080426),v = h(v, w, t, u, x[o + 6], A, 2821735955),u = h(u, v, w, t, x[o + 7], B, 4249261313),t = h(t, u, v, w, x[o + 8], y, 1770035416),w = h(w, t, u, v, x[o + 9], z, 2336552879),v = h(v, w, t, u, x[o + 10], A, 4294925233),u = h(u, v, w, t, x[o + 11], B, 2304563134),t = h(t, u, v, w, x[o + 12], y, 1804603682),w = h(w, t, u, v, x[o + 13], z, 4254626195),v = h(v, w, t, u, x[o + 14], A, 2792965006),u = h(u, v, w, t, x[o + 15], B, 1236535329),t = i(t, u, v, w, x[o + 1], C, 4129170786),w = i(w, t, u, v, x[o + 6], D, 3225465664),v = i(v, w, t, u, x[o + 11], E, 643717713),u = i(u, v, w, t, x[o + 0], F, 3921069994),t = i(t, u, v, w, x[o + 5], C, 3593408605),w = i(w, t, u, v, x[o + 10], D, 38016083),v = i(v, w, t, u, x[o + 15], E, 3634488961),u = i(u, v, w, t, x[o + 4], F, 3889429448),t = i(t, u, v, w, x[o + 9], C, 568446438),w = i(w, t, u, v, x[o + 14], D, 3275163606),v = i(v, w, t, u, x[o + 3], E, 4107603335),u = i(u, v, w, t, x[o + 8], F, 1163531501),t = i(t, u, v, w, x[o + 13], C, 2850285829),w = i(w, t, u, v, x[o + 2], D, 4243563512),v = i(v, w, t, u, x[o + 7], E, 1735328473),u = i(u, v, w, t, x[o + 12], F, 2368359562),t = j(t, u, v, w, x[o + 5], G, 4294588738),w = j(w, t, u, v, x[o + 8], H, 2272392833),v = j(v, w, t, u, 
x[o + 11], I, 1839030562),u = j(u, v, w, t, x[o + 14], J, 4259657740),t = j(t, u, v, w, x[o + 1], G, 2763975236),w = j(w, t, u, v, x[o + 4], H, 1272893353),v = j(v, w, t, u, x[o + 7], I, 4139469664),u = j(u, v, w, t, x[o + 10], J, 3200236656),t = j(t, u, v, w, x[o + 13], G, 681279174),w = j(w, t, u, v, x[o + 0], H, 3936430074),v = j(v, w, t, u, x[o + 3], I, 3572445317),u = j(u, v, w, t, x[o + 6], J, 76029189),t = j(t, u, v, w, x[o + 9], G, 3654602809),w = j(w, t, u, v, x[o + 12], H, 3873151461),v = j(v, w, t, u, x[o + 15], I, 530742520),u = j(u, v, w, t, x[o + 2], J, 3299628645),t = k(t, u, v, w, x[o + 0], K, 4096336452),w = k(w, t, u, v, x[o + 7], L, 1126891415),v = k(v, w, t, u, x[o + 14], M, 2878612391),u = k(u, v, w, t, x[o + 5], N, 4237533241),t = k(t, u, v, w, x[o + 12], K, 1700485571),w = k(w, t, u, v, x[o + 3], L, 2399980690),v = k(v, w, t, u, x[o + 10], M, 4293915773),u = k(u, v, w, t, x[o + 1], N, 2240044497),t = k(t, u, v, w, x[o + 8], K, 1873313359),w = k(w, t, u, v, x[o + 15], L, 4264355552),v = k(v, w, t, u, x[o + 6], M, 2734768916),u = k(u, v, w, t, x[o + 13], N, 1309151649),t = k(t, u, v, w, x[o + 4], K, 4149444226),w = k(w, t, u, v, x[o + 11], L, 3174756917),v = k(v, w, t, u, x[o + 2], M, 718787259),u = k(u, v, w, t, x[o + 9], N, 3951481745),t = c(t, p),u = c(u, q),v = c(v, r),w = c(w, s);var O = m(t) + m(u) + m(v) + m(w);return O.toLowerCase()}

发现这个h函数就是个md5加密方式,这样的话也就太容易了。

解决page和ac参数

page参数一看就知道是页数了,第一页为空,第二页为2,如:访问第一页时,page:"",访问第二页时,page:"2",后面以此类推。

ac参数就是cookie中的cna参数值。cna的值需要访问 https://log.mmstat.com/eg.js 才能获得,
代码如下:

import requests

# A Session is used so that cookies set by the response are kept on
# `session.cookies`, where the `cna` value can be read afterwards.
session = requests.Session()

# Headers sent with the eg.js request; the referer is a uland.taobao.com search page.
headers = {
    'authority': 'log.mmstat.com',
    'method': 'GET',
    'path': '/eg.js',
    'scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': "https://uland.taobao.com/sem/tbsearch?spm=a2e15.8261149.07626516003.1.7ab029b4ZS1aM4&refpid=mm_26632258_3504122_32538762&keyword=%E6%B7%98%E5%AE%9D&clk1=f1a5efd94339f0e6e7cba63ed79b8554&upsid=f1a5efd94339f0e6e7cba63ed79b8554&page=2&_input_charset=utf-8",
    'sec-fetch-dest': 'script',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
session.get('https://log.mmstat.com/eg.js', headers=headers)
print(session.cookies.get('cna'))

发现这个h函数就是个md5加密方式,这样的话也就太容易了。

(在这里插入图片描述)

(在这里插入图片描述)

解决page和ac参数

page参数一看就知道是页数了,第一页为空,第二页为2,如:访问第一页时,page:"",访问第二页时,page:"2",后面以此类推。

ac参数就是cookie中的cna参数值。cna的值需要访问 https://log.mmstat.com/eg.js 才能获得,代码如下:

import requests
session = requests.Session()
# Headers sent with the eg.js request; the referer is a uland.taobao.com search page.
headers = {
    'authority': 'log.mmstat.com',
    'method': 'GET',
    'path': '/eg.js',
    'scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': "https://uland.taobao.com/sem/tbsearch?spm=a2e15.8261149.07626516003.1.7ab029b4ZS1aM4&refpid=mm_26632258_3504122_32538762&keyword=%E6%B7%98%E5%AE%9D&clk1=f1a5efd94339f0e6e7cba63ed79b8554&upsid=f1a5efd94339f0e6e7cba63ed79b8554&page=2&_input_charset=utf-8",
    'sec-fetch-dest': 'script',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# The response sets cookies on the session; the `cna` cookie is then printed.
session.get(
    'https://log.mmstat.com/eg.js',
    headers=headers
)
print(
    session.cookies.get('cna')
)

有人可能说我为什么会知道这些,因为我认真的分析了整个网站,这个部分的分析过程就不放出来了。

Python进行爬取

前面已经分析了整个网站,废话不多说,直接上完整代码,复制过去就可以直接运行。淘宝对每个搜索结果最多只显示100页,所以这里也只爬取100页;如果你需要搜索的关键词与我的不同,自行修改即可。

import hashlib
import time
import requests
from urllib import parse
import json


class TaoBao():
    """Fetch search results from the mtop.alimama.union.sem.landing.pc.items API.

    Constructing an instance performs the whole crawl: it resolves the
    uland landing URL, obtains the ``_m_h5_tk`` and ``cna`` cookies, then
    requests pages 1..100 and prints each decoded JSON response.
    """

    _APP_KEY = '12574478'
    _API_URL = ('https://h5api.m.taobao.com'
                '/h5/mtop.alimama.union.sem.landing.pc.items/1.0/?')
    _USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/83.0.4103.116 Safari/537.36')
    _LAST_PAGE = 100

    def __init__(self, keyword):
        """Run the crawl for ``keyword`` (the search term)."""
        self.keyword = keyword
        self.session = requests.Session()
        self.cookies = {}
        self.data = {}
        self.params = {}
        self.headers = {}
        # One-time setup: landing URL, signing token, and cna cookie.
        self.get_url()
        self.get_m_h5_tk()
        self.get_cna()
        for page in range(1, self._LAST_PAGE + 1):
            self.get_data(page)

    def to_cookie(self):
        """Return a cookie header string holding only the ``_m_h5*`` and ``cna`` cookies."""
        return ''.join(
            key + '=' + value + '; '
            for key, value in self.cookies.items()
            if '_m_h5' in key or 'cna' == key
        )

    def h(self, a):
        """Return the MD5 hex digest of string ``a`` (encoded as UTF-8)."""
        return hashlib.md5(a.encode()).hexdigest()

    @staticmethod
    def _timestamp_ms():
        """Return the current Unix time in milliseconds as a string.

        Computed arithmetically: slicing ``str(time.time())`` yields a wrong
        value whenever the float prints with fewer than three decimals.
        """
        return str(int(time.time() * 1000))

    @staticmethod
    def _dump(data):
        """Serialize ``data`` as compact JSON without altering spaces inside values."""
        return json.dumps(data, ensure_ascii=False, separators=(',', ':'))

    @staticmethod
    def _strip_jsonp(text):
        """Return the payload between the first '(' and the last ')' of ``text``.

        Only the wrapper is removed, so parentheses inside the JSON payload
        survive. Text without a wrapper is returned unchanged.
        """
        start = text.find('(')
        end = text.rfind(')')
        if start == -1 or end < start:
            return text
        return text[start + 1:end]

    @staticmethod
    def _extract_clk1(location):
        """Return the ``clk1`` query value of the redirect URL ``location``.

        Falls back to the third '&'-separated piece (the previous positional
        behaviour) when no ``clk1`` parameter is present, or '' if there is
        no such piece.
        """
        values = parse.parse_qs(parse.urlsplit(location).query).get('clk1')
        if values:
            return values[0]
        pieces = location.split('&')
        return pieces[2] if len(pieces) > 2 else ''

    def _sign(self, token):
        """Refresh ``self.timestamp`` and return ``(sign, data_json)`` for ``self.data``."""
        self.timestamp = self._timestamp_ms()
        data_json = self._dump(self.data)
        sign = self.h(token + "&" + self.timestamp + "&" + self._APP_KEY + "&" + data_json)
        return sign, data_json

    # Get the uland URL and the clk1 value
    def get_url(self):
        """Request the redirect endpoint and store ``self.uland_url`` and ``self.clk1``.

        Raises:
            RuntimeError: if the response carries no ``Location`` header.
        """
        headers = {
            "Host": "redirect.simba.taobao.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": self._USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive"
        }
        url = 'http://redirect.simba.taobao.com/rd?c=un&w=unionsem&k=74175f13c74157df&p=mm_26632258_3504122_32538762&b=R9JQC0peugGMqgRlL5O&s={keyword}&f=https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword{keywords}'.format(
            keyword=parse.quote(self.keyword),
            keywords=parse.quote(parse.quote('=' + self.keyword)))
        # Redirects are not followed: the Location header itself is the result.
        response = self.session.get(url, headers=headers, allow_redirects=False)
        location = response.headers.get('Location')
        if location is None:
            raise RuntimeError('redirect response has no Location header')
        self.clk1 = self._extract_clk1(location)
        self.uland_url = location

    # Get _m_h5_tk and _m_h5_tk_enc
    def get_m_h5_tk(self):
        """Send a first request signed with the placeholder token 'undefined'.

        The response cookies are copied into ``self.cookies`` and the part of
        ``_m_h5_tk`` before the first '_' is stored as ``self.token``.
        """
        self.data = {
            "keyword": self.keyword,
            "ppath": "",
            "loc": "",
            "minPrice": "",
            "maxPrice": "",
            "ismall": "",
            "ship": "",
            "itemAssurance": "",
            "exchange7": "",
            "custAssurance": "",
            "b": "",
            "clk1": self.clk1,
            "pvoff": "",
            "pageSize": "100",
            "page": "",
            "elemtid": "1",
            "refpid": "mm_26632258_3504122_32538762",
            "pid": "430673_1006",
            "featureNames": "spGoldMedal,dsrDescribe,dsrDescribeGap,dsrService,dsrServiceGap,dsrDeliver, dsrDeliverGap",
            "ac": "",
            "wangwangid": "",
            "catId": ""
        }
        sign, data_json = self._sign('undefined')
        self.params = {
            'jsv': '2.4.0',
            'appKey': self._APP_KEY,
            't': self.timestamp,
            'sign': sign,
            'api': 'mtop.alimama.union.sem.landing.pc.items',
            'v': '1.0',
            'AntiCreep': 'true',
            'dataType': 'jsonp',
            'type': 'jsonp',
            'ecode': '0',
            'callback': 'mtopjsonp1',
            'data': data_json
        }
        self.headers = {
            'host': 'h5api.m.taobao.com',
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3',
            'referer': self.uland_url,
            'user-agent': self._USER_AGENT,
        }
        self.session.get(self._API_URL + parse.urlencode(self.params), headers=self.headers)
        self.cookies.update(self.session.cookies.items())
        self.token = self.cookies['_m_h5_tk'].split('_')[0]

    # Get cna
    def get_cna(self):
        """Request eg.js and copy the session cookies (including ``cna``) into ``self.cookies``."""
        headers = {
            'authority': 'log.mmstat.com',
            'method': 'GET',
            'path': '/eg.js',
            'scheme': 'https',
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': self.uland_url,
            'sec-fetch-dest': 'script',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'cross-site',
            'user-agent': self._USER_AGENT
        }
        self.session.get('https://log.mmstat.com/eg.js', headers=headers)
        self.cookies.update(self.session.cookies.items())

    # Get the data
    def get_data(self, page):
        """Request result page ``page`` (1-based) and print the decoded JSON.

        Page 1 is sent with empty ``page`` and ``ac`` fields; later pages send
        the page number as a string and the ``cna`` cookie as ``ac``.
        """
        first = page == 1
        self.data['page'] = "" if first else str(page)
        self.data['ac'] = "" if first else (self.cookies.get('cna') or "")
        sign, data_json = self._sign(self.token)
        self.params['t'] = self.timestamp
        self.params['sign'] = sign
        self.params['data'] = data_json
        self.headers['cookie'] = self.to_cookie()
        response = self.session.get(self._API_URL + parse.urlencode(self.params), headers=self.headers)
        json_data = json.loads(self._strip_jsonp(response.text))
        print(json_data)


if __name__ == '__main__':
    TaoBao('男装')     # search keyword

Python3爬取淘宝网商品数据!相关推荐

  1. python爬取淘宝商品做数据挖掘

    作业要求:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3159 项目内容: 本项目选择 淘宝商品类目:零食 数量:一共100页,44 ...

  2. 爬取淘宝商品信息之数据分析篇

    淘宝爬虫请戳:https://blog.csdn.net/weixin_43746433/article/details/97623511 Github:https://github.com/why1 ...

  3. 利用Python爬虫爬取淘宝商品做数据挖掘分析实战篇,超详细教程

    项目内容 本案例选择>> 商品类目:沙发: 数量:共100页  4400个商品: 筛选条件:天猫.销量从高到低.价格500元以上. 项目目的 1. 对商品标题进行文本分析 词云可视化 2. ...

  4. python电商数据挖掘_Python 爬取淘宝商品数据挖掘分析实战

    作者 孙方辉 本文为CDA志愿者投稿作品,转载需授权 项目内容 本案例选择>> 商品类目:沙发: 数量:共100页 4400个商品: 筛选条件:天猫.销量从高到低.价格500元以上. 项目 ...

  5. python电商数据挖掘_利用Python爬取淘宝商品并数据挖掘与分析实战!此乃大型项目!...

    项目内容 本案例选择>> 商品类目:沙发: 数量:共100页 4400个商品: 筛选条件:天猫.销量从高到低.价格500元以上. 项目目的 1. 对商品标题进行文本分析 词云可视化 2. ...

  6. [Python3网络爬虫开发实战] 7-动态渲染页面爬取-4-使用Selenium爬取淘宝商品

    在前一章中,我们已经成功尝试分析Ajax来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取.比如,淘宝,它的整个页面数据确实也是通过Ajax获取的,但是这些Ajax接口参数比较复杂,可 ...

  7. python爬虫淘宝手机_【Python3 爬虫】14_爬取淘宝上的手机图片

    现在我们想要使用爬虫爬取淘宝上的手机图片,那么该如何爬取呢?该做些什么准备工作呢? 首先,我们需要分析网页,先看看网页有哪些规律 我们可以看到左侧是主题市场,将鼠标移动到[女装/男装/内衣]这一栏目, ...

  8. python爬取淘宝淘女郎图片

    网上有许多爬取淘宝淘女郎的代码,发现有的都不可用,就自己改写了一个,当前可用日期为2017.07.30 前提 chromedriver.exe(本文用的是2.30,有两种使用方式,一种是直接绝对路径引 ...

  9. 使用Appium爬取淘宝App数据

    0x01.介绍说明 1.简介 Appium是一个自动化测试开源工具.通过WebDriver协议驱动IOS.Android.Windows Phone平台上的原生应用.混合应用和web应用. 2.App ...

最新文章

  1. HDU1212(大数取模-秦九昭算法)
  2. “性能调优”坑惨了几十万程序员
  3. STM32下一次程序后J-link不能识别问题解决
  4. vivo X30系列发布会邀请函曝光:名副其实的“望远镜”
  5. 使用TensorFlow.js进行AI在网络摄像头中翻译手势和手语
  6. mysql 锁怎么使用_MySQL锁的用法之行级锁
  7. C语言求一个文件的长度,求二进制文件的长度
  8. .net socket与完成端口、异步发送相关研究
  9. Python开发之用户密码存储
  10. 前端学习笔记之三PS
  11. 微信小说,微信游戏系统域名被屏蔽是怎么回事
  12. Hung-yi Li Machine Learning 2019 Task1
  13. 【数据异常校验】T检验或T测试(T-test)
  14. 《STL源码剖析》-- stl_uninitialized.h
  15. python批量添加qq好友_python实现QQ批量登录功能
  16. xampp v3.2.2 php版本,xampp 3.2.2下载
  17. 2019-2020-1 1823《程序设计与数据结构》第一周作业总结
  18. 基于Python医学院校二手书管理毕业设计-附源码201704
  19. Java数据结构之栈详解
  20. springboot表单验证

热门文章

  1. [附源码]Java计算机毕业设计SSM大学生健康管理系统的设计与实现
  2. 过滤器与拦截器的区别?
  3. 矩阵分析L1 线性空间基础
  4. 《MetaSploit渗透测试魔鬼训练营》之WEB应用渗透技术
  5. 函数柯里化与反柯里化
  6. 3个最基础的APP技术框架
  7. 基金从业资格考试(科目二):证券投资基金(第二版)上册 学习笔记
  8. php js实现流程图,详解js中构造流程图的核心技术JsPlumb(2)_javascript技巧
  9. linux命令--文件夹重命名
  10. 展锐智能机平台sc9820e调试pwm背光所遇问题小结