一、Python 实现 App 数据抓取的需求

1、分析豆果美食数据包

2、通过 Python 多线程(线程池)抓取数据

3、通过使用代理 IP 隐藏爬虫

4、将数据保存到 MongoDB 中

handle_mongo.py

import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Thin wrapper around a local MongoDB connection used to store recipes.

    Connects to 127.0.0.1:27017 and works on the ``dou_guo_mei_shi``
    database; documents are written to the ``dgms_item`` collection.
    """

    def __init__(self):
        self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert ``item`` into the ``dgms_item`` collection.

        Args:
            item: A single document (dict) or a list of documents.
        """
        db_collection = Collection(self.db_data, "dgms_item")
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # 4.0. It accepted either one document or a list, so dispatch to the
        # matching replacement to stay compatible with both call styles.
        if isinstance(item, list):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)


# Module-level instance imported by the spider script.
mongo_info = Connect_mongo()

spider_dgms.py

import requests
import json
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor# 数据队列
# Shared work queue: handle_index() puts one search payload per ingredient
# into it and the __main__ block drains it into the thread pool.
# NOTE(review): Queue here is multiprocessing.Queue although only threads use
# it; its qsize() may raise NotImplementedError on macOS - consider
# queue.Queue instead.
queue_list = Queue()


# Request helper.
def handel_request(url, data, proxies=None, timeout=10):
    """POST ``data`` as a form to ``url`` with the app-like request headers.

    Args:
        url: Full URL of the API endpoint.
        data: Mapping of form fields sent as the request body.
        proxies: Optional requests-style proxy mapping, e.g.
            ``{"http": "127.0.0.1:8888"}``. ``None`` sends the request
            directly, as before.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            thread forever.

    Returns:
        The ``requests.Response`` object; the status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` expires.
    """
    # Headers mimic an Android client (see the User-Agent). The commented-out
    # entries are kept as a record of fields that are intentionally not sent.
    header = {
        # "Cookie": "duid=69270019",
        "client": "4",
        "version": "7106.2",
        # "channel": "baidu",
        "act-code": "1637324809",
        "act-timestamp": "1637324809",
        "pset": "1",
        # "pseudo-id": "44c57e66cae004c9",
        "device": "SM-N976N",
        "brand": "samsung",
        "sdk": "25,7.1.2",
        "resolution": "1280*720",
        "dpi": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        "imsi": "460071317077478",
        "uuid": "f4d26323-9b23-403c-9187-e662ba7fc470",
        "User-Agent": "Mozilla/5.0 (Linux; Android 7.1.2; SM-N976N Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "battery-level": "0.98",
        "battery-state": "3",
        "caid": "44c57e66cae004c9",
        "bssid": "AC:22:0B:07:74:4E",
        "display-resolution": "1280*720",
        "scale": "1.5",
        "reach": "1",
        "rom-version": "d2que-user 7.1.2 QP1A.190711.020 700211101 release-keys",
        "syscmp-time": "1635765679000",
        "countrycode": "CN",
        "sysmemory": "3186032640",
        "sysdisksize": "61.39 GB",
        "terms-accepted": "1",
        "newbie": "1",
        "app-state": "0",
        "bootmark": "822206a5-4ae7-412f-8e85-601e91ff3d10",
        "updatemark": "1635817290.809861000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "session-info": "O0y/dPYTJAm0BbGoew14b/fQctKsmmjzQdUMA09SAmiCS7mYzKaP/73rPRSMQ2uIW+v7/szNkhJjhWHJizw+1luYaLUqgMlU1ieRCbekVZspYwlXLyRowX2oEkhJ0MIj",
        "Host": "api.douguo.net",
        # "Content-Length": "179",
    }
    response = requests.post(
        url=url,
        headers=header,
        data=data,
        proxies=proxies,
        timeout=timeout,
    )
    return response


# Request the recipe category index.
def handle_index():
    """Fetch the recipe catalog and enqueue one search payload per entry.

    Posts to the ``flatcatalogs`` endpoint, walks every second-level entry
    under ``result.cs[*].cs`` and puts a search form (keyed by the entry's
    ``name``) onto ``queue_list``. Returns nothing.
    """
    catalog_url = "https://api.douguo.net/recipe/flatcatalogs"
    # The "_session" and "v" fields are intentionally not sent (they were
    # disabled in the original code).
    catalog_form = {
        "client": "4",
        "_vs": "0",
        "sign_ran": "e53fd78533e564209a573d14bd449d83",
        "code": "a8665c30af67ce0c",
    }
    catalog = json.loads(handel_request(catalog_url, catalog_form).text)

    # Lazily flatten the two-level category tree into its leaf entries.
    leaf_entries = (
        entry
        for group in catalog["result"]["cs"]
        for entry in group["cs"]
    )
    for entry in leaf_entries:
        # "_session" is likewise left out of the search form.
        queue_list.put(
            {
                "client": "4",
                "keyword": entry["name"],
                "order": "0",
                "_vs": "400",
                "type": "0",
                "auto_play_mode": "2",
                "sign_ran": "1ce60f6319b32e96194116a84c331275",
                "code": "bf2b7920137d77f9",
            }
        )


# Fetch the recipe list for one ingredient.
def _build_detail_payload(keyword, recipe_id):
    """Return the form fields for the recipe-detail request of ``recipe_id``."""
    # The original built "_ext" by string concatenation, which left the
    # keyword unquoted and added stray quotes, i.e. malformed JSON.
    # NOTE(review): "id" is emitted as a string because the original put a
    # closing quote after it - confirm against a captured request.
    ext = json.dumps(
        {
            "query": {
                "kw": keyword,
                "src": "11102",
                "idx": "2",
                "type": "13",
                "id": str(recipe_id),
            }
        },
        ensure_ascii=False,
        separators=(",", ":"),
    )
    return {
        "client": "4",
        "_session": "1637373525108351564145807749",
        "author_id": "0",
        "_vs": "11102",
        "_ext": ext,
        "is_new_user": "1",
        "sign_ran": "f588cc28475995ac6398393f5007e4be",
        "code": "584429579d7b207a",
    }


def handle_caipu_list(data):
    """Search recipes for one ingredient and store each result in MongoDB.

    Args:
        data: Search form produced by ``handle_index``; its ``"keyword"``
            entry is the ingredient name.

    For every search hit whose ``type`` is 13, the recipe detail is fetched
    and the combined record is passed to ``mongo_info.insert_item``.
    """
    keyword = data["keyword"]
    print("当前处理的食材:", keyword)
    caipu_list_url = "https://api.douguo.net/recipe/v2/search/0/20"
    caipu_list_response = handel_request(url=caipu_list_url, data=data)
    caipu_list_response_dict = json.loads(caipu_list_response.text)

    for item in caipu_list_response_dict["result"]["list"]:
        # Only type-13 entries are processed (presumably recipes; other
        # types are skipped as in the original).
        if item["type"] != 13:
            continue

        recipe = item["r"]
        caipu_info = {
            "shicai": keyword,
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            "describe": recipe["cookstory"],
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        # Fetch the detailed cooking instructions for this recipe.
        detail_url = "https://api.douguo.net/recipe/v2/detail/" + str(caipu_info["shicai_id"])
        detail_data = _build_detail_payload(keyword, caipu_info["shicai_id"])
        detail_response = handel_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cook_step"] = detail_recipe["cookstep"]

        print("当前入库菜谱是: ", caipu_info["caipu_name"])
        mongo_info.insert_item(caipu_info)  # store the record in MongoDB
        print("插入完成")


if __name__ == '__main__':
    # To try a single ingredient instead, run handle_index() followed by
    # handle_caipu_list(queue_list.get()).

    # Crawl all ingredients with a thread pool.
    handle_index()
    futures = {}
    # The with-block shuts the pool down and waits for all submitted tasks.
    with ThreadPoolExecutor(max_workers=20) as pool:
        while queue_list.qsize() > 0:
            payload = queue_list.get()
            futures[pool.submit(handle_caipu_list, payload)] = payload["keyword"]

    # Exceptions raised in workers are stored on the futures; report them
    # instead of dropping them silently.
    for future, failed_keyword in futures.items():
        error = future.exception()
        if error is not None:
            print("抓取失败:", failed_keyword, error)

爬虫_app 4 app数据抓取入门相关推荐

  1. python爬取app播放的视频,Python爬虫工程师必学——App数据抓取实战视频教程

    爬虫分为几大方向:Web 网页数据抓取、App 数据抓取、软件系统数据抓取。本课程主要为同学讲解如何用 Python 实现 App 数据抓取,课程从开发环境搭建、App 爬虫必备利器详解、项目实战,到最后的多App ...

  2. 22.网络爬虫—APP数据抓取详讲

    网络爬虫-APP数据抓取详讲 Fiddler 工作原理 安装完成Fiddler后的配置 前提条件 工具配置 手机数据抓取 Fiddler手机端配置 手机端操作 实战演示 后记 前言:

  3. Python爬虫实战:手机APP数据抓取分析!谁说不能爬取app数据的?

    大多数手机 APP 里面返回的是 JSON 格式数据,或者一堆加密过的数据。这里以超级课程表 APP 为例,使用 Python 抓取超级课程表里用户发的话题,主要是练习 Python 爬取 APP 的一些方式和技巧。 1. ...

  4. python中国大学排名爬虫写明详细步骤-Python爬虫--2019大学排名数据抓取

    Python爬虫--2019大学排名数据抓取 准备工作 输入:大学排名URL连接 输出:大学排名信息屏幕输出 所需要用到的库:requests,bs4 思路 获取网页信息 提取网页中的内容并放到数据结 ...

  5. 网络爬虫——中国大学排名数据抓取

    网络爬虫--中国大学排名数据抓取 目标网址 中国大学排名网:http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html 全球有很多份大学排名,这里以上 ...

  6. 如何用python抓取文献_浅谈Python爬虫技术的网页数据抓取与分析

    浅谈 Python 爬虫技术的网页数据抓取与分析 吴永聪 [期刊名称] <计算机时代> [年 ( 卷 ), 期] 2019(000)008 [摘要] 近年来 , 随着互联网的发展 , 如何 ...

  7. 爬虫教程( 3 ) --- 手机 APP 数据抓取

    1. Fiddler 设置 这是使用 fiddler 进行手机 app 的抓包,也可以使用 Charles,burpSuite 等... 电脑安装 Fiddler, 手机 和 安装 fiddler 的 ...

  8. 基于Python爬虫的股票成交量数据抓取分析系统

    目录 数据获取 2 1.1. 实验环境搭建 2 1.2. 抓取数据 2 1.2.1. 新浪财经 3 1.2.2. 网易财经 6 1.2.3. 东方财富 12 1.2.4. TuShare (挖地兔) ...

  9. 爬虫的原理和数据抓取

    为什么要做爬虫? 都说现在是"大数据时代",那数据从何而来? 企业产生的用户数据:百度指数、阿里指数、TBI腾讯浏览指数、新浪微博指数; 数据平台购买数据:数据堂、国云数据市场、贵阳 ...

最新文章

  1. 一文初识:美、日、中3国药品GMP特点
  2. C# 创建、部署和调用WebService的示例
  3. Android上超级好用的前端调试方法(adb reverse)
  4. VTK:vtkCompositePolyDataMapper2用法实战
  5. PHP性能如何实现全面优化?
  6. 每日Ubuntu小技巧 - 使用TeamViewer连接远程桌面
  7. shell脚本和linux命令,Linux shell脚本全面学习(一)
  8. IntelliJ IDEA打开错误 _CGContextSetAllowsAcceleration
  9. 吉林大学超星学习通04
  10. intent传递集合数据
  11. 计算机保研面试中,都有哪些令人窒息的问题?
  12. This Exception was thrown from a job compiled with Burst, which has limited exception support. 报错
  13. 腾讯全民wifi如何?
  14. c语言统计学生成绩输入一个正整数n,输入一个正整数n,再输入n个学生的成绩,计算平均分,并统计各等级成绩的个数...
  15. thinkpadE430c加装固态硬盘小记
  16. Win10 如何隐藏控制面板里面的BitLocker 驱动器加密功能
  17. 新时期,老师该怎么撑伞?
  18. sendmail安装使用
  19. NTC热敏电阻采集温度
  20. 2012年3月18日学习

热门文章

  1. Protel 2004 电路设计 鲁捷,焦振宇,孟凡文编著
  2. 二叉树的后序非递归遍历(巧妙思想)
  3. JAVA题目~分数类Fraction Exp03-4
  4. Python3.7 下安装pyqt5
  5. ios沙箱模式开启_iOS沙盒(sandBox)机制总结
  6. 【运筹学】(2)—预测
  7. android 耳机监听权限,android 耳机监听
  8. Pandas数据分析实战1——淘宝粽子行业分析
  9. 全国软件专业人才开发与设计赛题之中等题“统计省份人员信息”
  10. 运用程序化交易系统的能力表现在哪些方面?