榛果美团登录爬虫 requests session

美团旗下所有站点的登录都是通过重定向来完成的。

也就是说，可以利用 session 会话来解决登录问题。

当然，也可以每次请求都带上它的 cookie 来模拟登录。

我用的代理是阿布云代理，你们也可以选择别的代理。

这次爬取的是美团旗下的榛果民宿。

  1 import requests
  2 from urllib.parse import urlencode
  3 import json
  4 import time, datetime
  5 import logging
  6 from lxml import etree
  7 import pymysql
  8 from pymysql.err import IntegrityError
  9
 10 proxies_ = {
 11     'http': '@http-dyn.abuyun.com:9020',
 12     'https': '@http-dyn.abuyun.com:9020',
 13 }
 14 headers = {
 15     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52'
 16 }
 17 session = requests.Session()
 18
 19
 20 def session_get(url, header=headers, tab=12):
 21     if tab == 0:
 22         return False
 23     try:
 24         response = session.get(url, headers=header, proxies=proxies_)
 25         time.sleep(2)
 26         return response if response.status_code == 200 else session_get(url, header, tab - 1)
 27     except Exception as e:
 28         if tab == 1:
 29             logging.exception(e)
 30         return session_get(url, header, tab - 1)
 31
 32
 33 def session_post(url, header=headers, data=None, tab=12):
 34     if tab == 0:
 35         return False
 36     try:
 37         response = session.post(url, headers=header, data=data, proxies=proxies_)
 38         time.sleep(2)
 39         return response if response.status_code == 200 else session_post(url, header, data, tab - 1)
 40     except Exception as e:
 41         if tab == 1:
 42             logging.exception(e)
 43         return session_post(url, header, data, tab - 1)
 44
 45
 46 def get_node_text(node, xpath):
 47     """
 48     通过节点和xpath来获取节点需要的内容
 49     :param node:
 50     :param xpath:
 51     :return:
 52     """
 53     try:
 54         if xpath == "string(.)": return node.xpath('string(.)').strip()
 55         if len(node.xpath(xpath)) > 0:
 56             return node.xpath(xpath)[0].strip() if isinstance(node.xpath(xpath)[0], str) else node.xpath(xpath)[0]
 57         return ""
 58     except:
 59         logging.exception('获取xpath %s 出错' % (xpath))
 60         return None
 61
 62
 63 def get_youjia_tpp_conn():
 64     """
 65     获取井队数据库连接
 66     :return:
 67     """
 68     return pymysql.connect(host='host', user='user', passwd='passwd', db='db', port=3306,
 69                            charset='utf8')
 70
 71
 72 def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
 73     """
 74     非json类型数据存储数据库
 75     :param data_json:
 76     :param t_name:
 77     :param l_name:
 78     :return:
 79     """
 80     now_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 81     data_list = []
 82     insert_sql = "INSERT INTO " + l_name + "." + t_name + " ("
 83     update_sql = "UPDATE " + l_name + "." + t_name + " SET "
 84     for key in data_json:
 85         update_sql += str(key) + "=%s , "
 86         if str(key) == "id":
 87             id_key = data_json[key]
 88         insert_sql += str(key) + ","
 89     update_sql += "modify_time = '" + str(now_time) + "' where id = '" + str(id_key) + "'"
 90     insert_sql = insert_sql[:-1]
 91     insert_sql += ")VALUES("
 92     for key in data_json:
 93         insert_sql += "%s,"
 94         data_list.append(str(data_json[key]))
 95     insert_sql = insert_sql[:-1]
 96     insert_sql += ");"
 97     # print(update_sql)
 98     # print(insert_sql)
 99     with get_youjia_tpp_conn() as conn:
100         try:
101             print("storage_database_text  insert_sql : ", t_name)
102             conn.execute(insert_sql, tuple(data_list))
103         except IntegrityError:
104             print("storage_database_text  update_sql : ", t_name)
105             conn.execute(update_sql, tuple(data_list))
106         except Exception as msg:
107             logging.exception(msg)
108
109
def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
    """Store a JSON string in one column: insert the row, or update it.

    The row ``(id, j_name)`` is inserted first; when that raises
    ``IntegrityError`` the existing row is updated instead and its
    ``modify_time`` column is set to the current local time. Any other
    database error is logged and swallowed.

    :param id_: value of the ``id`` column.
    :param data_json: the JSON text to store.
    :param j_name: name of the column that receives ``data_json``.
    :param t_name: table name.
    :param l_name: database (schema) name.
    :return: None
    """
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Identifiers cannot be sent as query parameters, so they are
    # concatenated; they must come from trusted code.
    table = l_name + "." + t_name
    insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
    updatesql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"

    conn = get_youjia_tpp_conn()
    try:
        # An explicit cursor works on every PyMySQL version; ``with conn``
        # no longer yields a cursor in PyMySQL 1.0+.
        with conn.cursor() as cursor:
            try:
                print("storage_database_json  insert_sql : ", t_name)
                cursor.execute(insert_sql, (id_, data_json))
            except IntegrityError:
                print("storage_database_json  update_sql : ", t_name)
                cursor.execute(updatesql, (data_json, now_time, id_))
        conn.commit()
    except Exception as msg:
        logging.exception(msg)
    finally:
        conn.close()
133
134
def pre_login():
    """Fetch the Meituan unified-login page for the ``phoenix`` service.

    :return: the page HTML, or ``None`` when the page could not be fetched.
    """
    param = {
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(param)
    response = session_get(url=url, header=headers, tab=5)
    if not response:
        # session_get swallows request errors and returns False once its
        # retries are used up, so that is the only failure signal here.
        print('预登陆出错')
        return None
    print("pre_login 成功")
    return response.text
152
153
def parse_param(html):
    """Extract the hidden login-form fields from the pre-login page.

    :param html: HTML text of the login page.
    :return: the tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``,
        or ``None`` when the page cannot be parsed or a field is missing.
        ``need_captcha`` is the captcha field's ``style`` attribute with
        ``"display:"`` removed.
    """
    try:
        tree = etree.HTML(html)

        def first(expression):
            # Raises IndexError when nothing matches; handled below.
            return tree.xpath(expression)[0]

        csrf = first('//input[@name="csrf"]/@value')
        origin = first('//input[@name="origin"]/@value')
        fingerprint = first('//input[@name="fingerprint"]/@value')
        uuid = first('//i[@class="form-uuid"]/text()')
        need_captcha = first(
            '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style'
        ).replace("display:", "")
        return (csrf, uuid, need_captcha, origin, fingerprint)
    except Exception:
        # ``Exception`` rather than a bare except, so Ctrl-C still works.
        print('解析csrf,uuid,need_captcha出错')
        return None
166
167
def formal_login(username, password, param, captcha_path='C:/Users/admin/Desktop/image/zg.jpg'):
    """Download the captcha, ask the user to type it, and submit the login form.

    :param username: account name, sent as the ``email`` form field.
    :param password: account password.
    :param param: the tuple returned by ``parse_param``:
        ``(csrf, uuid, need_captcha, origin, fingerprint)``. The
        ``need_captcha`` entry is not consulted; the captcha is always
        requested.
    :param captcha_path: file the captcha image is written to so the user
        can open it and read the code.
    :return: the HTML of the login response, or ``None`` when ``param`` is
        empty or a request failed.
    """
    if not param:
        print('登录出错')
        return None
    csrf = param[0]
    uuid = param[1]
    origin, fingerprint = param[3], param[4]

    captcha_url = 'https://passport.meituan.com/account/captcha?' + urlencode({'uuid': uuid})
    print(captcha_url)
    image_resp = session_get(captcha_url)
    if not image_resp:
        # session_get returns False when every retry failed.
        print('登录出错')
        return None
    with open(captcha_path, 'wb') as file:
        file.write(image_resp.content)
    captcha = input('需要验证码:')

    url_param = {
        'uuid': uuid,
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    postdata = {
        'email': username,
        'password': password,
        'captcha': captcha,
        'origin': origin,
        'fingerprint': fingerprint,
        'csrf': csrf
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(url_param)
    response = session_post(url, data=postdata, header=headers)
    if not response:
        print('登录出错')
        return None
    # NOTE(review): this only means the POST returned HTTP 200; whether the
    # credentials were accepted is presumably decided by the page content.
    print("登陆成功！")
    return response.text
208
209
def parse_token(html):
    """Extract the redirect form fields from the login response page.

    :param html: HTML text returned by the login POST.
    :return: a dict with the keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``, or ``None`` when the page
        cannot be parsed or a field is missing (the error is logged).
    """
    try:
        tree = etree.HTML(html)

        def first(expression):
            # Raises IndexError when nothing matches; handled below.
            return tree.xpath(expression)[0]

        return {
            "action_url": first('//form[@class="J-form mainbox__content"]/@action'),
            "token": first('//input[@name="token"]/@value'),
            "expire": first('//input[@name="expire"]/@value'),
            "isdialog": first('//input[@name="isdialog"]/@value'),
            "autologin": first('//input[@name="autologin"]/@value'),
            "csrf": first('//*[@id="csrf"]/text()'),
        }
    except Exception:
        # ``Exception`` rather than a bare except, so Ctrl-C still works.
        logging.exception('解析token出错')
        return None
227
228
def redirect_login(token_json):
    """Post the login token to the redirect URL to finish the login.

    Also stores the CSRF token in the module-level ``headers`` under
    ``x-csrf-token``.

    :param token_json: dict as returned by ``parse_token`` with the keys
        ``action_url``, ``token``, ``expire``, ``isdialog``, ``autologin``
        and ``csrf``.
    :return: None
    """
    postdata = {
        'token': token_json['token'],
        'expire': token_json['expire'],
        'isdialog': token_json['isdialog'],
        'autologin': token_json['autologin'],
        'logintype': 'normal'
    }
    headers['x-csrf-token'] = token_json['csrf']
    trust_response = session_post(token_json['action_url'], data=postdata, header=headers)
    # session_post returns False when every retry failed, so success is only
    # reported when a response actually came back.
    if trust_response:
        print("重定向成功！！")
    else:
        print('重定向出错')
251
252
def test():
    """Manual smoke check: request a profile page and print the result.

    Waits five seconds, requests ``http://maoyan.com/profile`` through the
    shared session and prints the status code and body, or an error message
    when the request failed.

    :return: None
    """
    time.sleep(5)
    url = 'http://maoyan.com/profile'
    response = session_get(url, header=headers)
    if not response:
        # session_get returns False when every retry failed.
        print('测试出错')
        return
    print(response.status_code)
    print(response.text)
263
264
def crawl_order(account_id, token, page_no=1, page_size=20):
    """Crawl the host's orders page by page and store each order.

    Reads the CSRF token from the orders page, stores it in the module-level
    ``headers`` under ``x-csrf-token``, then queries the order API starting
    at ``page_no`` until a page comes back with fewer than ``page_size``
    entries.

    :param account_id: account id stored next to every order.
    :param token: login token; not used by this function.
    :param page_no: first page to fetch (1-based).
    :param page_size: number of orders requested per page.
    :return: None
    """
    orders_url = "https://www.zhenguo.com/host/orders/"
    response = session_get(orders_url, header=headers)
    if not response:
        # session_get returns False when every retry failed.
        return
    print(response.status_code)
    csrf_values = etree.HTML(response.text).xpath('//meta[@name="csrf-token"]/@content')
    if not csrf_values:
        return
    csrf = csrf_values[0]
    headers['x-csrf-token'] = csrf
    print(csrf)

    # The JSON content headers go into a copy so they do not leak into the
    # shared ``headers`` used for ordinary page requests.
    json_headers = dict(headers)
    json_headers['Accept'] = "application/json"
    json_headers['Content-Type'] = "application/json"
    query_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

    # Paging is a loop: the former recursive call passed the next page number
    # in the ``token`` position, so page 1 was requested again and again.
    while True:
        payload = {'pageNow': page_no, 'pageSize': page_size, 'orderStatusType': 9}
        query_response = session_post(query_url, data=json.dumps(payload), header=json_headers)
        if not query_response:
            return
        query_list = (query_response.json().get('data') or {}).get('list') or []
        print(len(query_list))
        for order_json in query_list:
            order_id = order_json['orderId']
            storage_database_json(order_id, json.dumps(order_json), 'order', 'zhenguo_order')
            storage_database_text({"id": order_id, 'account_id': account_id}, 'zhenguo_order')
        if len(query_list) != page_size:
            return  # a short page is the last page
        page_no += 1
288
289
def house_detail(list_json):
    """Fetch a listing's detail page and add the parsed fields to ``list_json``.

    :param list_json: dict describing the listing; must contain ``id``. It is
        modified in place with ``room_type``, ``house_wear``, ``room_area``,
        ``unsubscribe`` and, when present on the page, ``room_count``,
        ``comment_count``, ``rep_rate``, ``rep_length``, ``less_day`` and
        ``more_day``.
    :return: the same ``list_json`` object; unchanged when the detail page
        could not be fetched.
    """
    room_id = list_json["id"]
    room_url = "https://www.zhenguo.com/housing/%s" % room_id
    room_response = session_get(room_url)
    if not room_response:
        return list_json

    html = etree.HTML(room_response.text)
    # Every field lives under the same container; build the paths from it.
    base = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'
    summary = base + '/section[1]/div[2]'
    list_json["room_type"] = get_node_text(html, summary + '/span[1]/text()')
    list_json["house_wear"] = get_node_text(html, summary + '/span[2]/text()')
    list_json["room_area"] = get_node_text(html, summary + '/span[3]/text()')

    # Label shown on the page -> key stored in list_json.
    label_to_key = {
        "房源": "room_count",
        "评价": "comment_count",
        "咨询回复率": "rep_rate",
        "咨询回复时长": "rep_length",
    }
    for node in html.xpath(base + '/section[2]/ul/li'):
        label = get_node_text(node, './div[1]/text()')
        value = get_node_text(node, './div[2]/text()')
        if isinstance(label, str) and label in label_to_key:
            list_json[label_to_key[label]] = value

    # get_node_text returns None when the lookup raised; treat it as empty
    # instead of crashing on ``None.split``.
    reserve_text = get_node_text(html, base + '/section[8]/ul[1]/li[2]/text()') or ""
    reserve = reserve_text.split("，")
    if len(reserve) > 1:
        list_json["less_day"] = reserve[0].replace("最少预订", "").replace("天", "").strip()
        list_json["more_day"] = reserve[1].replace("最多预订", "").replace("天", "").strip()
    list_json["unsubscribe"] = get_node_text(html, base + '/section[8]/ul[2]/li/text()')
    return list_json
337
338
def crawl_room(account_id, token):
    """Crawl the host's listing page and store every listing with its comments.

    For each listing card the title, price, state and product id are read,
    the detail page is parsed via ``house_detail``, the comments JSON is
    stored in the ``comment`` column and the listing fields are stored in
    ``zhenguo_room_info``.

    :param account_id: account id stored with every listing.
    :param token: login token; not used by this function.
    :return: None
    """
    comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
    room_list_url = "https://www.zhenguo.com/house/list/"
    room_response = session_get(url=room_list_url, header=headers)
    if not room_response:
        # Without the listing page there is nothing to iterate; previously
        # this fell through to an unbound ``html``.
        return
    html = etree.HTML(room_response.text)
    for node in html.xpath('//div[@class="houseCard__block"]'):
        title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')  # title
        # get_node_text returns None when the lookup raised; treat as empty.
        price = (get_node_text(node, './div[@class="houseCard__addLine clearfix"]'
                                     '/span[1]/span[@class="houseCard__price"]/text()') or "").replace("¥", "")  # price
        state = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]/'
                                    'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')  # state
        room_id = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]'
                                      '/div[1]/@data-product-id')  # listing id
        print(account_id, title, price, state, room_id)
        list_json = {"account_id": account_id, "title": title,
                     "price": price, "state": state, "id": room_id, "room_id": room_id}

        house_json = house_detail(list_json)
        response = session_get(url=comment_url % room_id)
        if response:
            print(response.text)
            storage_database_json(room_id, json.dumps(response.json()), "comment", "zhenguo_room_info",
                                  l_name="youjia_tpp")
        storage_database_text(house_json, 'zhenguo_room_info')
365
366
def crawl_room_list(account_id, token):
    """List the online listings through the mobile API and print their quotas.

    Uses an Android app User-Agent and sends the login token as a ``token``
    cookie. Each listing and, when the request succeeds, the ``data`` part of
    its quota response are printed.

    :param account_id: account id; not used by this function.
    :param token: login token placed in the ``Cookie`` header.
    :return: None
    """
    app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                  "Cookie": "token=" + token}
    list_url = "https://iphx.meituan.com/ds/product/online/list"
    list_resp = session_get(url=list_url, header=app_header)
    if not list_resp:
        return
    for room_json in list_resp.json()['data']['list']:
        room_id = room_json['productId']
        product_quota_url = "https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/" + str(room_id)
        product_quota_resp = session_get(url=product_quota_url, header=app_header)
        print(room_json)
        # session_get returns False when every retry failed; skip the quota
        # for that listing instead of crashing on ``False.json()``.
        if product_quota_resp:
            print(product_quota_resp.json()['data'])
382
383
384
def crawl(account_id, token):
    """Run the crawl steps once the session is logged in.

    Only the mobile-API listing step is active; the listing-page crawl
    (``crawl_room``) and the order crawl (``crawl_order``) are not called.

    :param account_id: account id handed to the crawl step.
    :param token: login token handed to the crawl step.
    :return: None
    """
    crawl_room_list(account_id=account_id, token=token)
394
395
def login(username, password):
    """Run the whole login flow and return the login token.

    Steps: fetch the login page, parse its hidden fields, submit the login
    form (with a manually typed captcha), parse the redirect form, and post
    it to finish the login.

    :param username: account name.
    :param password: account password.
    :return: the ``token`` value parsed from the login response.
    :raises RuntimeError: when the login page or the login response cannot
        be parsed, instead of failing later with an unrelated ``TypeError``.
    """
    param = parse_param(pre_login())
    print("param: ", param)
    if not param:
        raise RuntimeError("login failed: could not parse the pre-login page")

    html_login = formal_login(username, password, param)
    token_json = parse_token(html_login)
    print("token_json: ", token_json)
    if not token_json:
        raise RuntimeError("login failed: no token found in the login response")

    redirect_login(token_json)
    return token_json['token']
406
407
if __name__ == '__main__':
    # Placeholder credentials (both values are the literal 'username');
    # replace them with a real account before running.
    credentials = ('username', 'username')
    # Log in first, then crawl with account id 1 and the returned token.
    crawl(1, login(*credentials))

榛果美团登录爬虫 requests session相关推荐

python爬虫——利用 session 处理登录状态 github 登录实例
更多精彩内容 Cookie Cookies是服务器在本地机器上存储的小段文本并随每一个请求发送至同一个服务器 Session session机制是一种服务器端的机制,服务器使用一种类似于散列表的结构( ...
网络爬虫--requests、post、解密、Cookie、Session、IP代理
网络爬虫的定义网络爬虫是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本. 根据使用场景,网络爬虫可分为通用爬虫和聚焦爬虫两种. 通用爬虫通用网络爬虫是捜索引擎抓取系统(Baidu.Goo ...
爬虫之利用requests.session进行状态保持
requests模块中的Session类能够自动处理发送请求获取响应过程中产生的cookie,进而达到状态保持的目的. 1.1 requests.session的作用以及应用场景 requests.s ...
python模拟session_python使用requests.session模拟登录
最近开发一套接口,写个Python脚本,使用requests.session模拟一下登录. 因为每次需要获取用户信息,登录需要带着session信息,所以所有请求需要带着session. 请求使用po ...
python爬虫使用session保持登录状态
今天有个客户需求,从网站上下载会员试题,需要在登录状态下载,然后将网页中展示的试题保存在word中. 网站上展示的所有试题要保存在一个word文档中,但是每一个试题结束下一个试题开始都是分开页码,一道 ...
python模拟登录详细教程_Python模拟登录requests.Session应用详解
最近由于某些原因,需要用到Python模拟登录网站,但是以前对这块并不了解,而且目标网站的登录方法较为复杂, 所以一下卡在这里了,于是我决定从简单的模拟开始,逐渐深入地研究下这块. 注:本文仅为交流学 ...
零基础爬虫requests初阶教程，手把手教你爬数据
目录一.环境与工具二.学爬虫必备知识三.简单体验 requests 四.get 请求 3.1 基础讲解一 3.3 基础讲解二 3.2 基础讲解三 3.4 获取cookie 3.5 获取请求头 3 ...
python爬虫requests库_python爬虫基础教程：requests库（二）代码实例
get请求简单使用 import requests ''' 想要学习Python?Python学习交流群:973783996满足你的需求,资料都已经上传群文件,可以自行下载! ''' respons ...
python爬虫requests库_python爬虫使用Requests库 - pytorch中文网
在入门教程中我们介绍了urllib库和urllib2的用法,同时我们了解一些爬虫的基础以及对爬虫有了基本的了解.其实在我们生产环境中,使用Request库更加方便与实用,同时我们这需要短短的几行代码就 ...

榛果美团登录爬虫 requests session

榛果美团登录爬虫 requests session相关推荐

最新文章

热门文章

榛果 美团 登录 爬虫 requests session

榛果 美团 登录 爬虫 requests session相关推荐

最新文章

热门文章

榛果美团登录爬虫 requests session

榛果美团登录爬虫 requests session相关推荐