Python3网络爬虫:今日头条新闻App的广告数据抓取
咱们就不说废话了,直接上完整的源码
def startGetData(self):
    """Crawl a random number (2-10) of Toutiao feed pages and hand each URL to parse_url.

    The query strings below were captured verbatim from the iOS App; the
    device ids / signatures embedded in them are presumably tied to one
    account session -- TODO confirm they are still accepted before running.
    """
    # Random page count per run so the traffic pattern is less uniform.
    ret = random.randint(2, 10)
    index = 0
    url = ""
    while index < ret:
        if index == 0:
            # First page: the App's "pull to refresh" request
            # (tt_from=pull, min_behot_time, list_count fixed at 37 in the URL).
            url = "http://lf.snssdk.com/api/news/feed/v80/?fp=PSTqPzFSJ2HuFlG7LlU1FYmeLS4_&version_code=6.6.5&app_name=news_article&vid=07FEB726-62D1-442A-ADE2-6781" \
                  "0CF8C421&device_id=51911855605&channel=App%20Store&resolution=750*1334&aid=13&ab_version=304488,346137,349052,271178,326588,326524,326532,338589,3" \
                  "36927,295827,325048,345778,239096,348856,344345,170988,346540,332095,325197,338954,330633,297058,276204,286212,313219,338067,348326,347814,277771," \
                  "310595,342074,334586,339207,323233,328671,346557,280773,338894,319962,344870,345191,348452,349020,348669,343444,214069,337616,348941,207251,26631" \
                  "2,247847,280447,281298,328218,325618,328227,348992,288417,290193,326190,339904,344131&ab_feature=201617,z1&ab_group=z1,201617&openudid=0bf32dfcb91c" \
                  "3dc330eb92c492a9e9093fc44b51&idfv=07FEB726-62D1-442A-ADE2-67810CF8C421&ac=WIFI&os_version=9.3.1&ssmix=a&device_platform=iphone&iid=31813899088&ab_c" \
                  "lient=a1,f2,f7,e1&device_type=iPhone%206&idfa=0784D090-1DC1-4B24-BAA1-0C474ED94D52&detail=1&refresh_reason=1&last_refresh_sub_entrance_interva" \
                  "l=" + str(int(time.time())) + "&tt_from=pull&count=20&list_count=37&support_rn=4&LBS_status=deny&cp=54AbF4Ad5aAE6q1&loc_mode=0&min_behot_time=" + str(int(time.time())) + "&image=1&session_refres" \
                  "h_idx=3&strict=1&refer=1&language=zh-Hans-CN&concern_id=6286225228934679042&as=a295754fa45e4a0a5a3192&ts=" + str(int(time.time()))
            # Seed for the list_count parameter used by subsequent pages.
            self.list_count = 17
        elif index >= 1:
            # Later pages: the App's "load more" request (tt_from=load_more,
            # max_behot_time; list_count grows by 8 per page, see below).
            url = "http://lf.snssdk.com/api/news/feed/v80/?fp=PSTqPzFSJ2HuFlG7LlU1FYmeLS4_&version_code=6.6.5&app_name=news_article&vid=07FEB726-62D1-442A-ADE2-67810" \
                  "CF8C421&device_id=51911855605&channel=App%20Store&resolution=750*1334&aid=13&ab_version=304488,346137,349052,271178,326588,326524,326532,338589,336" \
                  "927,295827,325048,345778,239096,348856,344345,170988,346540,332095,325197,338954,330633,297058,276204,286212,313219,338067,348326,347814,277771,31" \
                  "0595,342074,334586,339207,323233,328671,346557,280773,338894,319962,344870,345191,348452,349020,348669,343444,214069,337616,348941,207251,266312,2" \
                  "47847,280447,281298,328218,325618,328227,348992,288417,290193,326190,339904,344131&ab_feature=201617,z1&ab_group=z1,201617&openudid=0bf32dfcb91c3dc" \
                  "330eb92c492a9e9093fc44b51&idfv=07FEB726-62D1-442A-ADE2-67810CF8C421&ac=WIFI&os_version=9.3.1&ssmix=a&device_platform=iphone&iid=31813899088&ab_clien" \
                  "t=a1,f2,f7,e1&device_type=iPhone%206&idfa=0784D090-1DC1-4B24-BAA1-0C474ED94D52&detail=1&last_refresh_sub_entrance_interval=" + str(int(time.time())) + "&tt_from=load_m" \
                  "ore&count=20&list_count=" + str(self.list_count) + "&support_rn=4&LBS_status=deny&cp=5bAfF1A75eB77q1&max_behot_time=" + str(int(time.time())) + "&loc_mode=0&image=1&strict=1&city=&refer=1&concer" \
                  "n_id=6286225228934679042&language=zh-Hans-CN&as=a285d52fa5274adb8a3006&ts=" + str(int(time.time()))
            self.list_count += 8
        # Throttle between requests so we don't hammer the endpoint.
        time.sleep(5)
        index = index + 1
        print(url)
        self.parse_url(url)
这个是启动函数
def parse_url(self, url):
    """GET *url* with the captured App headers and feed the JSON body to parse_json.

    Network errors (requests.RequestException) propagate to the caller.
    """
    # NOTE(review): verify=False disables TLS certificate checking -- the
    # captured endpoint is plain http here, but do not reuse this for
    # untrusted hosts.
    # Fixed: requests has no default timeout, so a stalled server would hang
    # the crawler forever; bound each request to 30 seconds.
    response = requests.get(url, headers=self.getHeader(), verify=False, timeout=30)
    self.parse_json(response.content.decode("utf-8"))
网络请求并返回json字符串
def getHeader(self):
    """Return the iOS-App HTTP headers (including session cookies) for feed requests.

    NOTE(review): Host is "is.snssdk.com" while the feed URLs target
    lf.snssdk.com -- matches the captured traffic, but confirm it is intended.
    """
    header = {"Host": "is.snssdk.com",
              "Accept-Language": "zh-Hans;q=1",
              # Millisecond timestamp, regenerated on every call.
              "tt-request-time": str(int(time.time() * 1000)),
              "Connection": "keep-alive",
              "Accept-Encoding": "gzip,deflate",
              # Session cookies copied verbatim from a captured request; the
              # stray spaces around some '=' signs are present in the capture.
              "Cookie": "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage=76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt = 4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid = 1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360] = 1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992",
              "X-SS-Cookie": "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage = 76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt=4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid=1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360]=1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992",
              "User-Agent": "News/6.6.5(iPhone;iOS10.2;Scale/2.00)",
              "Accept": "*/*"}
    # Debug trace: when the headers were built (ms since epoch).
    print(str(int(time.time() * 1000)))
    return header
头部封装
def parse_json(self, jsonStr):
    """Parse one feed-response JSON string and persist every game ad found.

    Walks data[*].content; an item is kept when its label equals "广告"
    (advertisement) and one of its filter_words names contains "游戏" (game),
    in which case it is handed to savaDataInfo for storage.
    """
    print(jsonStr)
    DataInfo.time = Util().getCurrTime()
    try:
        json_list = json.loads(jsonStr)["data"]
        for json_str in json_list:
            # Each feed item carries its payload as a nested JSON string.
            content = json.loads(json_str["content"])
            # Fixed: idiomatic equality test instead of "广告".__eq__(...);
            # .get also folds the separate "label" membership check into one lookup.
            if content.get("label") == "广告":
                print("广告")
                for filter_word in content["filter_words"]:
                    if "游戏" in filter_word["name"]:
                        print("游戏" + str(content))
                        self.savaDataInfo(content)
    except KeyError as x:
        # Malformed / unexpected feed item: log the missing key and skip the batch.
        print(x)
解析json数据
def savaDataInfo(self, content):
    """Copy one ad's fields into the DataInfo holder and insert it via MySqlManager.

    (Method name kept as-is -- "sava" is a historical typo callers depend on.)
    """
    DataInfo.title = content["title"]
    DataInfo.type = 1
    DataInfo.channel = "jinritoutiao"
    # Fixed: don't KeyError when the ad carries no raw_ad_data block at all.
    raw_ad = content.get("raw_ad_data", {})
    if "download_url" in raw_ad:
        DataInfo.appdownload = raw_ad["download_url"]
    # Downloads the creative image and fills DataInfo.pic_* / video fields.
    self.saveBitmapUrlOrPath(content)
    DataInfo.device_type = "ios"
    DataInfo.app_name = content["source"]
    # Table id 3 maps to shard table material_3 (see MySqlManager.getTableName).
    MySqlManager().insert_inspection_list(3)
保存数据到mysql
def saveBitmapUrlOrPath(self, content):
    """Pick the ad's image/video URLs, download the main image, and record paths on DataInfo."""
    bitmap = {}
    video = {}
    bitmap_path = {}
    # Microsecond timestamp keeps downloaded filenames unique.
    filename = "pic1_" + str(int(time.time() * 1000000)) + ".jpg"
    bitmap_path["pic_path1"] = self.path + filename
    DataInfo.pic_path = bitmap_path
    if "display_url" in content:
        # Video ad: source_type 3, keep the video URL and a cover image.
        DataInfo.source_type = 3
        video["video1"] = content["display_url"]
        if "video_detail_info" in content:
            bitmap["pic1"] = content["video_detail_info"]["detail_video_large_image"]["url"]
        else:
            # NOTE(review): assumes large_image_list is non-empty here --
            # an ad without images would raise IndexError; confirm upstream filter.
            bitmap["pic1"] = content["large_image_list"][0]["url"]
    else:
        # Plain image ad: source_type 1.
        DataInfo.source_type = 1
        bitmap["pic1"] = content["large_image_list"][0]["url"]
    # Download the chosen cover image to self.path.
    Util().save_img(bitmap["pic1"], filename, self.path)
    DataInfo.pic_list = bitmap
    DataInfo.video = video
下载图片到服务器,并保存图片路径
# Insert the product detail into the database
def insert_product_detail(self, product_id, json_obj):
    """Upsert one row of table product_detail from the API's json_obj.

    NOTE(review): every statement in this class is built by %-formatting
    values straight into SQL text -- injection-prone and it breaks on quotes
    in any field; should migrate to parameterized cursor.execute(sql, args).
    """
    table_name = "product_detail"
    # isProductIdExits returns the SELECT's matched-row count, so 1 == already stored.
    if self.isProductIdExits(table_name, product_id) == 1:
        update_sql = "UPDATE " + table_name + " SET company_num=%d,days=%d,first_seen='%s',labels='%s',last_seen='%s'," \
                     "logo_url='%s',media_list='%s',media_num=%d,product_id=%d,product_name='%s',updated_at='%s' WHERE product_id=%d" \
                     % (json_obj["companyNum"], json_obj["days"], json_obj["firstSeen"], json_obj["labels"],
                        json_obj["lastSeen"], json_obj["logoURL"], json.dumps(json_obj["mediaList"]),
                        json_obj["mediaNum"], json_obj["productId"], json_obj["productName"],
                        self.getCurrentTime(), product_id)
        self.execute(update_sql)
    else:
        insert_sql = "INSERT INTO " + table_name + "(company_num,days,first_seen,labels,last_seen,logo_url,media_list,media_num,product_id,product_name,created_at)" \
                     " VALUES (%d ,%d ,'%s','%s','%s','%s','%s',%d ,%d ,'%s','%s')" \
                     % (json_obj["companyNum"], json_obj["days"], json_obj["firstSeen"], json_obj["labels"],
                        json_obj["lastSeen"], json_obj["logoURL"], json.dumps(json_obj["mediaList"]),
                        json_obj["mediaNum"], json_obj["productId"], json_obj["productName"],
                        self.getCurrentTime())
        self.execute(insert_sql)

# Insert the product-detail chart data into the database
def insert_product_detail_table(self, product_id, json_obj):
    """Upsert one row of table product_detail_table for this product."""
    table_name = "product_detail_table"
    if self.isProductIdExits(table_name, product_id) == 1:
        # NOTE(review): ad_creative_list is assigned twice in this SET list;
        # the second occurrence receives adMaterialList and was presumably
        # meant to be ad_material_list (the INSERT below uses that column).
        update_sql = "UPDATE " + table_name + " SET ad_creative_list='%s',ad_creative_list='%s',xlabel='%s',ad_count_last_year=%d,product_id=%d,updated_at='%s' WHERE product_id=%d" \
                     % (json.dumps(json_obj["adCreativeList"]), json.dumps(json_obj["adMaterialList"]),
                        json.dumps(json_obj["xlabel"]), json_obj["adCountLastYear"], product_id,
                        self.getCurrentTime(), product_id)
        self.execute(update_sql)
    else:
        insert_sql = "INSERT INTO " + table_name + "(ad_creative_list,ad_material_list,xlabel,ad_count_last_year,product_id,created_at)" \
                     " VALUES ('%s','%s','%s',%d,%d,'%s')" \
                     % (json.dumps(json_obj["adCreativeList"]), json.dumps(json_obj["adMaterialList"]),
                        json.dumps(json_obj["xlabel"]), json_obj["adCountLastYear"], product_id,
                        self.getCurrentTime())
        self.execute(insert_sql)

# Insert the picture material into the database
def insert_product_detail_pic(self, product_id, json_obj):
    """Download the material's pictures, then upsert one row of product_pic_material_list."""
    # Side effect: also sets self.pic1_path / pic2_path / pic3_path for the INSERT below.
    self.savePic(json_obj)
    table_name = "product_pic_material_list"
    material_id = json_obj["materialId"]
    if self.isMaterialIdExits(table_name, material_id) == 1:
        # NOTE(review): 'video' appears twice in this SET list; the second
        # occurrence receives getCurrentTime() and was presumably meant to be
        # updated_at='%s' -- confirm against the table schema.
        update_sql = "UPDATE " + table_name + " SET company_num=%d,creative_num=%d,first_seen='%s',h=%d,last_days=%d,last_seen='%s',material_id=%d,material_type=%d," \
                     "media_list='%s',new='%s',pic1='%s',pic2='%s',pic3='%s',product_num=%d,video='%s',w=%d,product_id=%d,video='%s' WHERE material_id=%d" \
                     % (json_obj["companyNum"], json_obj["creativeNum"], json_obj["firstSeen"], json_obj["h"],
                        json_obj["lastDays"], json_obj["lastSeen"], material_id, json_obj["materialType"],
                        json.dumps(json_obj["mediaList"]), json_obj["new"], json_obj["pic1"], json_obj["pic2"],
                        json_obj["pic3"], json_obj["productNum"], json_obj["video"], json_obj["w"],
                        product_id, self.getCurrentTime(), material_id)
        self.execute(update_sql)
    else:
        insert_sql = "INSERT INTO " + table_name + "(company_num,creative_num,first_seen,h,last_days,last_seen,material_id,material_type,media_list,new,pic1,pic2,pic3" \
                     ",product_num,video,w,product_id,created_at,pic1_path,pic2_path,pic3_path)" \
                     " VALUES (%d,%d,'%s',%d,%d,'%s',%d,%d,'%s','%s','%s','%s','%s',%d,'%s',%d,%d,'%s','%s','%s','%s')" \
                     % (json_obj["companyNum"], json_obj["creativeNum"], json_obj["firstSeen"], json_obj["h"],
                        json_obj["lastDays"], json_obj["lastSeen"], json_obj["materialId"], json_obj["materialType"],
                        json.dumps(json_obj["mediaList"]), json_obj["new"], json_obj["pic1"], json_obj["pic2"],
                        json_obj["pic3"], json_obj["productNum"], json_obj["video"], json_obj["w"],
                        product_id, self.getCurrentTime(), self.pic1_path, self.pic2_path, self.pic3_path)
        self.execute(insert_sql)

def savePic(self, json_obj):
    """Download up to three material pictures and remember their server paths.

    NOTE(review): self.picN_path is only assigned when the corresponding URL
    is non-empty, so insert_product_detail_pic may read a path left over from
    a previous material (or hit AttributeError on the first one) -- verify.
    """
    pic1 = json_obj["pic1"]
    pic2 = json_obj["pic2"]
    pic3 = json_obj["pic3"]
    if pic1.strip() != '':
        # Microsecond timestamp keeps filenames unique across downloads.
        filename = "pic1_" + str(int(time.time() * 1000000)) + ".jpg"
        self.pic1_path = self.path + filename
        Util().save_img(pic1, filename, self.path)
    if pic2.strip() != '':
        filename = "pic2_" + str(int(time.time() * 1000000)) + ".jpg"
        self.pic2_path = self.path + filename
        Util().save_img(pic2, filename, self.path)
    if pic3.strip() != '':
        filename = "pic3_" + str(int(time.time() * 1000000)) + ".jpg"
        self.pic3_path = self.path + filename
        Util().save_img(pic3, filename, self.path)

def isProductIdExits(self, table_name, product_id):
    """Return the number of rows in *table_name* with this product_id (callers expect 0 or 1)."""
    # NOTE(review): "select *from" relies on the SQL tokenizer splitting '*'
    # and 'from'; a space after '*' would be clearer.
    query_sql = "select *from " + table_name + " where product_id = " + str(product_id)
    cursor = self.conn.cursor()
    # cursor.execute returns the matched-row count for this SELECT.
    result = cursor.execute(query_sql)
    print(result)
    self.conn.commit()
    return result

def isMaterialIdExits(self, table_name, material_id):
    """Return the number of rows in *table_name* with this material_id (callers expect 0 or 1)."""
    query_sql = "select *from " + table_name + " where material_id = " + str(material_id)
    cursor = self.conn.cursor()
    result = cursor.execute(query_sql)
    print(result)
    self.conn.commit()
    return result

def insert_inspection_list(self, table_id):
    """Insert the globally collected DataInfo fields into the sharded material_<n> table.

    NOTE(review): reads DataInfo.app_download, while the crawler code sets
    DataInfo.appdownload -- one of the two attribute names looks wrong; confirm.
    """
    sql = "INSERT INTO " + self.getTableName(table_id) + "(title,app_download,time,channel,type,content,gif,video,source_type,pic_list,pic_path,device_type,material_size,app_name,created_at,updated_at)" \
          " VALUES ('%s','%s','%s','%s',%d,'%s','%s','%s',%d,'%s','%s','%s','%s','%s','%s','%s')" \
          % (DataInfo.title, DataInfo.app_download, DataInfo.time, DataInfo.channel, DataInfo.type,
             DataInfo.content, json.dumps(DataInfo.gif), json.dumps(DataInfo.video), DataInfo.source_type,
             json.dumps(DataInfo.pic_list), json.dumps(DataInfo.pic_path), DataInfo.device_type,
             DataInfo.material_size, DataInfo.app_name, self.getCurrentTime(), self.getCurrentTime())
    cursor = self.conn.cursor()
    cursor.execute(sql)
    self.conn.commit()

def getCurrentTime(self):
    """Current local time formatted 'YYYY-MM-DD HH:MM:SS' for the *_at columns."""
    return str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

def getTableName(self, table_id):
    """Map a table id onto one of ten shard tables material_0 .. material_9."""
    return "material_" + str(table_id % 10)

def execute(self, sql):
    """Run one write statement and commit immediately."""
    cursor = self.conn.cursor()
    cursor.execute(sql)
    self.conn.commit()

def close(self):
    """Close the underlying MySQL connection."""
    self.conn.close()
保存数据库操作
Python3网络爬虫:今日头条新闻App的广告数据抓取相关推荐
- Python3网络爬虫:网易新闻App的广告数据抓取
咱们就不说废话了,直接上完整的源码 def startGetData(self):self.url = "https://nex.163.com/q" body = self.ge ...
- Python3网络爬虫:腾讯新闻App的广告数据抓取
废话就不说了,咱们直接上代码 def startGetData(self):index = 0while index < 3:index = index + 1self.url = " ...
- 网络爬虫-今日头条_signature参数逆向(第一弹)
失踪人口回归ing 今天要讲的是今日头条web版的_signature参数逆向 直接上链接 --> 今日头条 首先随便点开一个版本,这里点的是娱乐,抓包看看结果. 可以很清晰地看到首页html源 ...
- 手写网络协议栈-协议封装,netmap,dpdk网卡数据抓取,柔性数组
今夜只有一个话题,手写网络协议栈,保证大家都能学会 1. 协议头的封装 2. netmap/dpdk的原理 3. 柔性数组的使用 视频讲解如下,点击观看: 手写网络协议栈-协议封装,netmap,dp ...
- 网络爬虫-今日头条-街拍
爬取今日头条里的街拍数据 1.保存标题及图片URL到MongoDB 2.保存图片到本地 通过分析 今日头条街拍 的网页URL,可以发现其内容也是通过Ajax异步加载的,于是分析其API接口: http ...
- Android新闻阅读器(数据抓取)
第一篇技术博客,写得不好请见谅,谢谢(^_^) 由于最近师弟师妹们学习Android的需求,于是就写了此篇博客并且与各位分享一下. 整篇博客总共分为两部分. 第一部分搭建一个新闻列表界面(ListVi ...
- 企查查app新增企业数据抓取
企查查每日新增企业数据抓取 尚未完成的工作: 需要自行抓包获取设备id,appid,sign等等 sign和时间戳保持一致即可 把所有的数据库.redis配置 无法自动登录,账号需要独立 redis数 ...
- 转:【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图
[摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作 在本节 ...
- 【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图
[摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作 在本节 ...
最新文章
- oracle11g开启1158,1、Oracle11g中浏览器访问不了http://localhost:1158/em的问题
- Nginx为什么快到根本停不下来?
- 将集合中的内容按时间排序
- iphone分辨率_目前最值得入手的三款安卓机!流畅度堪比iPhone,用三五年不过时...
- UA MATH567 高维统计专题2 Low-rank矩阵及其估计1 Matrix Completion简介
- 研究动机(Motivation)-如何写好科技论文之我见(一)
- msyql show命令(转)
- VS2012 发布网站步骤
- 头插法和尾插法创建链表(有无头结点)
- 【2016年第1期】基于大数据的小麦蚜虫发生程度决策树预测分类模型
- .NET 基础一步步一幕幕[out、ref、params]
- netty 学习 (1)
- 上班两年干了些啥?该思考人生
- 软件项目工程中应该编写的十三类文档
- 计算机桌面出现蓝色底色,电脑桌面图标有蓝色阴影怎么去掉
- 初中计算机课堂游戏设计,如何设计初中信息技术课堂作业
- CentOS 7输入startx无法启动图形化界面
- HDU 2111 JAVA
- python图书搜索与书籍封面下载
- MIT Technology Review 2020年“十大突破性技术”解读 【中国科学基金】2020年第3期发布...
热门文章
- 优达学城 深度学习 任务3
- PDF时间戳数字签名
- win10服务器cpu占用过高,完美解决:Win10资源管理器占用CPU过高
- 通俗易懂,什么是.NET?什么是.NET Framework?什么是.NET Core? 转自:https://www.cnblogs.com/1996V/p/9037603.html#net1...
- Boot(重点SCSS☆☆☆☆☆)(day03)
- 阿里实名认证Java版(详细教程)
- linux虚拟机a problem has occurred and the system can‘t recover解决方案
- 一行命令批量修改染色体和位置为RS号
- 二分查找时间复杂度及其Python实现
- 如何玩转OA系统业务审批流程