咱们就不说废话了,直接上完整的源码

def startGetData(self):
    """Request a random number of feed pages and pass each URL to ``parse_url``.

    Between 2 and 10 pages are requested.  The first URL uses
    ``tt_from=pull`` and sets ``self.list_count`` to 17; every following URL
    uses ``tt_from=load_more``, embeds the current ``self.list_count`` and
    then increases it by 8.  The method sleeps 5 seconds before each request.
    """

    def stamp():
        # Current Unix time in whole seconds, as text for the query string.
        return str(int(time.time()))

    # Query-string prefix that is identical for both request flavours.
    shared_prefix = (
        "http://lf.snssdk.com/api/news/feed/v80/?fp=PSTqPzFSJ2HuFlG7LlU1FYmeLS4_"
        "&version_code=6.6.5&app_name=news_article"
        "&vid=07FEB726-62D1-442A-ADE2-67810CF8C421"
        "&device_id=51911855605&channel=App%20Store&resolution=750*1334&aid=13"
        "&ab_version=304488,346137,349052,271178,326588,326524,326532,338589,336927,"
        "295827,325048,345778,239096,348856,344345,170988,346540,332095,325197,"
        "338954,330633,297058,276204,286212,313219,338067,348326,347814,277771,"
        "310595,342074,334586,339207,323233,328671,346557,280773,338894,319962,"
        "344870,345191,348452,349020,348669,343444,214069,337616,348941,207251,"
        "266312,247847,280447,281298,328218,325618,328227,348992,288417,290193,"
        "326190,339904,344131"
        "&ab_feature=201617,z1&ab_group=z1,201617"
        "&openudid=0bf32dfcb91c3dc330eb92c492a9e9093fc44b51"
        "&idfv=07FEB726-62D1-442A-ADE2-67810CF8C421"
        "&ac=WIFI&os_version=9.3.1&ssmix=a&device_platform=iphone&iid=31813899088"
        "&ab_client=a1,f2,f7,e1&device_type=iPhone%206"
        "&idfa=0784D090-1DC1-4B24-BAA1-0C474ED94D52&detail=1"
    )

    page_total = random.randint(2, 10)
    for page in range(page_total):
        if page == 0:
            # First page: "pull" request.
            url = (
                shared_prefix
                + "&refresh_reason=1&last_refresh_sub_entrance_interval=" + stamp()
                + "&tt_from=pull&count=20&list_count=37&support_rn=4&LBS_status=deny"
                  "&cp=54AbF4Ad5aAE6q1&loc_mode=0&min_behot_time=" + stamp()
                + "&image=1&session_refresh_idx=3&strict=1&refer=1&language=zh-Hans-CN"
                  "&concern_id=6286225228934679042&as=a295754fa45e4a0a5a3192&ts=" + stamp()
            )
            self.list_count = 17
        else:
            # Later pages: "load_more" request carrying the running list count.
            url = (
                shared_prefix
                + "&last_refresh_sub_entrance_interval=" + stamp()
                + "&tt_from=load_more&count=20&list_count=" + str(self.list_count)
                + "&support_rn=4&LBS_status=deny&cp=5bAfF1A75eB77q1&max_behot_time=" + stamp()
                + "&loc_mode=0&image=1&strict=1&city=&refer=1"
                  "&concern_id=6286225228934679042&language=zh-Hans-CN"
                  "&as=a285d52fa5274adb8a3006&ts=" + stamp()
            )
            self.list_count += 8
        time.sleep(5)
        print(url)
        self.parse_url(url)

这个是启动函数

def parse_url(self, url, timeout=10):
    """Download ``url`` and pass the UTF-8 decoded body to ``parse_json``.

    Args:
        url: Feed URL to request.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Nothing is returned; the response text is forwarded to ``parse_json``.
    """
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the original code relied on it, but it should be removed
    # once the target endpoint's certificate chain can be trusted.
    response = requests.get(
        url,
        headers=self.getHeader(),
        verify=False,
        timeout=timeout,
    )
    self.parse_json(response.content.decode("utf-8"))
发起网络请求,并将返回的json字符串交给parse_json解析
def getHeader(self):
    """Build the HTTP headers sent with every feed request.

    Returns:
        dict: header name -> string value.  ``tt-request-time`` is the
        current Unix time in milliseconds; all other values are fixed.
    """
    # Take the timestamp once so the value printed below is exactly the value
    # sent in the header (two separate time.time() calls could differ).
    request_time = str(int(time.time() * 1000))
    header = {
        "Host": "is.snssdk.com",
        "Accept-Language": "zh-Hans;q=1",
        "tt-request-time": request_time,
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip,deflate",
        "Cookie": "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage=76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt = 4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid = 1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360] = 1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992",
        "X-SS-Cookie": "CNZZDATA1272189606=1385639719-1525687011-%7C1525692411;alert_coverage = 76;install_id=31781370987;ttreq=1$b79c6e66ea460b1579579c027e8073593305644e;odin_tt=4c07858cc8b75143c593d0a99a04aa8fcf10136c3dca9badd9c31a2aa9cc415022834c64d7f52952d9290e3028876735;UM_distinctid=1633a13d9fd41b-0910970a30f79a8-12485712-3d10d-1633a13d9fe84a;_ga=GA1.2.555016291.1525687770;_gid=GA1.2.96631484.1525687770;qh[360]=1;__tea_sdk__ssid=957b8ce1-d5b3-4010-bd9c-bfec73bdf526;__tea_sdk__user_unique_id=6552731409432937992;tt_webid=6552731409432937992",
        "User-Agent": "News/6.6.5(iPhone;iOS10.2;Scale/2.00)",
        "Accept": "*/*",
    }
    print(request_time)
    return header

头部封装

def parse_json(self, jsonStr):
    """Parse one feed response and save every game advertisement it contains.

    The response is expected to be a JSON object whose ``"data"`` entry is a
    list of items, each carrying its own JSON document under ``"content"``.
    An item is saved through ``savaDataInfo`` when its ``label`` equals
    "广告" and at least one of its ``filter_words`` names contains "游戏".

    Args:
        jsonStr: Raw response body as text.
    """
    print(jsonStr)
    DataInfo.time = Util().getCurrTime()
    try:
        items = json.loads(jsonStr)["data"]
    except (KeyError, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        print(err)
        return

    for item in items:
        # Handle errors per item so one malformed entry does not discard the
        # remaining entries of the page.
        try:
            content = json.loads(item["content"])
            # Plain == is required here: "广告".__eq__(x) returns the truthy
            # NotImplemented object whenever x is not a str.
            if "label" not in content or content["label"] != "广告":
                continue
            print("广告")
            is_game_ad = any(
                "游戏" in word["name"] for word in content["filter_words"]
            )
            if is_game_ad:
                print("游戏" + str(content))
                # Save once per ad, not once per matching filter word, so a
                # single ad cannot produce duplicate rows.
                self.savaDataInfo(content)
        except (KeyError, ValueError) as err:
            print(err)

解析json数据

def savaDataInfo(self, content):
    """Copy one advertisement into the shared ``DataInfo`` record and store it.

    Args:
        content: Decoded advertisement dict.  Must contain ``"title"``,
            ``"raw_ad_data"`` and ``"source"``; a missing key raises
            ``KeyError``.
    """
    DataInfo.title = content["title"]
    DataInfo.type = 1
    DataInfo.channel = "jinritoutiao"

    raw_ad_data = content["raw_ad_data"]
    if "download_url" in raw_ad_data:
        download_url = raw_ad_data["download_url"]
        # insert_inspection_list reads DataInfo.app_download, so the value
        # has to be stored under that name to reach the database.
        DataInfo.app_download = download_url
        # Previous spelling, kept in case other code still reads it.
        DataInfo.appdownload = download_url
    # NOTE(review): when "download_url" is absent the value left by an earlier
    # ad stays on DataInfo; confirm whether it should be cleared here.

    self.saveBitmapUrlOrPath(content)
    DataInfo.device_type = "ios"
    DataInfo.app_name = content["source"]
    MySqlManager().insert_inspection_list(3)

保存数据到mysql

def saveBitmapUrlOrPath(self, content):
    """Download the ad's cover image and record image/video details on ``DataInfo``.

    Sets ``DataInfo.pic_path`` (local file path), ``DataInfo.source_type``
    (3 when the ad has a ``display_url``, otherwise 1), ``DataInfo.pic_list``
    (image URL) and ``DataInfo.video`` (video URL, empty dict when absent).

    Args:
        content: Decoded advertisement dict.
    """
    # Microsecond timestamp for the local file name.
    filename = "pic1_" + str(int(time.time() * 1000000)) + ".jpg"
    DataInfo.pic_path = {"pic_path1": self.path + filename}

    video = {}
    if "display_url" not in content:
        # Image-only ad.
        DataInfo.source_type = 1
        image_url = content["large_image_list"][0]["url"]
    else:
        # Video ad: prefer the video's large cover image when it is provided.
        DataInfo.source_type = 3
        video["video1"] = content["display_url"]
        if "video_detail_info" in content:
            image_url = content["video_detail_info"]["detail_video_large_image"]["url"]
        else:
            image_url = content["large_image_list"][0]["url"]

    Util().save_img(image_url, filename, self.path)
    DataInfo.pic_list = {"pic1": image_url}
    DataInfo.video = video

下载图片到服务器,并保存图片路径

# Insert the product detail into the database.
def insert_product_detail(self, product_id, json_obj):
    """Insert or update one row of ``product_detail``.

    When a row with ``product_id`` already exists it is updated (and
    ``updated_at`` refreshed); otherwise a new row is inserted with
    ``created_at`` set.

    Values are passed to the driver as query parameters instead of being
    formatted into the SQL text, so quotes and backslashes in scraped data
    can neither break nor alter the statement.
    Assumes a MySQL DB-API driver that uses the ``%s`` placeholder style —
    TODO confirm against the connection setup.

    Args:
        product_id: Id used to look up the existing row.
        json_obj: Dict with keys ``companyNum``, ``days``, ``firstSeen``,
            ``labels``, ``lastSeen``, ``logoURL``, ``mediaList``,
            ``mediaNum``, ``productId`` and ``productName``.
    """
    table_name = "product_detail"
    # int()/str() mirror the %d / '%s' conversions of the former format string.
    values = (
        int(json_obj["companyNum"]),
        int(json_obj["days"]),
        str(json_obj["firstSeen"]),
        str(json_obj["labels"]),
        str(json_obj["lastSeen"]),
        str(json_obj["logoURL"]),
        json.dumps(json_obj["mediaList"]),
        int(json_obj["mediaNum"]),
        int(json_obj["productId"]),
        str(json_obj["productName"]),
        self.getCurrentTime(),
    )
    # Any positive row count means "exists"; testing for exactly 1 would
    # insert yet another row once duplicates are present.
    if self.isProductIdExits(table_name, product_id):
        sql = (
            "UPDATE " + table_name + " SET company_num=%s,days=%s,first_seen=%s,"
            "labels=%s,last_seen=%s,logo_url=%s,media_list=%s,media_num=%s,"
            "product_id=%s,product_name=%s,updated_at=%s WHERE product_id=%s"
        )
        params = values + (int(product_id),)
    else:
        sql = (
            "INSERT INTO " + table_name + "(company_num,days,first_seen,labels,"
            "last_seen,logo_url,media_list,media_num,product_id,product_name,"
            "created_at) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        params = values
    cursor = self.conn.cursor()
    cursor.execute(sql, params)
    self.conn.commit()


# Insert the product detail page chart data into the database.
def insert_product_detail_table(self, product_id, json_obj):
    """Insert or update one row of ``product_detail_table``.

    When a row with ``product_id`` already exists it is updated (and
    ``updated_at`` refreshed); otherwise a new row is inserted with
    ``created_at`` set.

    Values are passed to the driver as query parameters instead of being
    formatted into the SQL text.
    Assumes a MySQL DB-API driver that uses the ``%s`` placeholder style —
    TODO confirm against the connection setup.

    Args:
        product_id: Id of the product the chart data belongs to.
        json_obj: Dict with keys ``adCreativeList``, ``adMaterialList``,
            ``xlabel`` and ``adCountLastYear``.
    """
    table_name = "product_detail_table"
    values = (
        json.dumps(json_obj["adCreativeList"]),
        json.dumps(json_obj["adMaterialList"]),
        json.dumps(json_obj["xlabel"]),
        int(json_obj["adCountLastYear"]),
        int(product_id),
        self.getCurrentTime(),
    )
    # Any positive row count means "exists"; testing for exactly 1 would
    # insert yet another row once duplicates are present.
    if self.isProductIdExits(table_name, product_id):
        # The second column must be ad_material_list: assigning
        # ad_creative_list twice overwrote the creative list with the
        # material list and never updated ad_material_list.
        sql = (
            "UPDATE " + table_name + " SET ad_creative_list=%s,ad_material_list=%s,"
            "xlabel=%s,ad_count_last_year=%s,product_id=%s,updated_at=%s"
            " WHERE product_id=%s"
        )
        params = values + (int(product_id),)
    else:
        sql = (
            "INSERT INTO " + table_name + "(ad_creative_list,ad_material_list,xlabel,"
            "ad_count_last_year,product_id,created_at) VALUES (%s,%s,%s,%s,%s,%s)"
        )
        params = values
    cursor = self.conn.cursor()
    cursor.execute(sql, params)
    self.conn.commit()


# Insert the image material into the database.
def insert_product_detail_pic(self, product_id, json_obj):
    """Download a material's pictures and insert or update its database row.

    The row lives in ``product_pic_material_list`` and is keyed by
    ``json_obj["materialId"]``.  An existing row is updated (and
    ``updated_at`` refreshed); otherwise a new row is inserted together with
    the local picture paths produced by ``savePic``.

    Assumes a MySQL DB-API driver that uses the ``%s`` placeholder style —
    TODO confirm against the connection setup.

    Args:
        product_id: Id of the product the material belongs to.
        json_obj: Dict with keys ``companyNum``, ``creativeNum``,
            ``firstSeen``, ``h``, ``lastDays``, ``lastSeen``, ``materialId``,
            ``materialType``, ``mediaList``, ``new``, ``pic1``, ``pic2``,
            ``pic3``, ``productNum``, ``video`` and ``w``.
    """
    self.savePic(json_obj)
    table_name = "product_pic_material_list"
    material_id = json_obj["materialId"]
    exists = self.isMaterialIdExits(table_name, material_id)
    # int()/str() mirror the %d / '%s' conversions of the former format string.
    values = (
        int(json_obj["companyNum"]),
        int(json_obj["creativeNum"]),
        str(json_obj["firstSeen"]),
        int(json_obj["h"]),
        int(json_obj["lastDays"]),
        str(json_obj["lastSeen"]),
        int(material_id),
        int(json_obj["materialType"]),
        json.dumps(json_obj["mediaList"]),
        str(json_obj["new"]),
        str(json_obj["pic1"]),
        str(json_obj["pic2"]),
        str(json_obj["pic3"]),
        int(json_obj["productNum"]),
        str(json_obj["video"]),
        int(json_obj["w"]),
        int(product_id),
        self.getCurrentTime(),
    )
    # Any positive row count means "exists"; testing for exactly 1 would
    # insert yet another row once duplicates are present.
    if exists:
        # The last assignment is updated_at: it used to be a second "video="
        # that overwrote the video column with the timestamp.
        sql = (
            "UPDATE " + table_name + " SET company_num=%s,creative_num=%s,"
            "first_seen=%s,h=%s,last_days=%s,last_seen=%s,material_id=%s,"
            "material_type=%s,media_list=%s,new=%s,pic1=%s,pic2=%s,pic3=%s,"
            "product_num=%s,video=%s,w=%s,product_id=%s,updated_at=%s"
            " WHERE material_id=%s"
        )
        params = values + (int(material_id),)
    else:
        sql = (
            "INSERT INTO " + table_name + "(company_num,creative_num,first_seen,h,"
            "last_days,last_seen,material_id,material_type,media_list,new,pic1,"
            "pic2,pic3,product_num,video,w,product_id,created_at,pic1_path,"
            "pic2_path,pic3_path) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        params = values + (self.pic1_path, self.pic2_path, self.pic3_path)
    self.execute(sql, params)


def savePic(self, json_obj):
    """Download ``pic1``..``pic3`` of a material and remember their local paths.

    For every non-blank picture URL the file is saved through
    ``Util().save_img`` and ``self.picN_path`` is set to its location under
    ``self.path``.  Paths of blank pictures are reset to an empty string so
    values from a previously processed material are never reused.
    """
    # Read all three URLs first so a missing key fails before any download.
    urls = [(key, json_obj[key]) for key in ("pic1", "pic2", "pic3")]
    for key, url in urls:
        path_attr = key + "_path"
        setattr(self, path_attr, "")
        if url.strip() != "":
            # Microsecond timestamp for the local file name.
            filename = key + "_" + str(int(time.time() * 1000000)) + ".jpg"
            setattr(self, path_attr, self.path + filename)
            Util().save_img(url, filename, self.path)


def isProductIdExits(self, table_name, product_id):
    """Return the driver's result for selecting rows with ``product_id``.

    Callers treat the result as the number of matching rows.
    """
    query_sql = "select * from " + table_name + " where product_id = %s"
    cursor = self.conn.cursor()
    result = cursor.execute(query_sql, (product_id,))
    print(result)
    self.conn.commit()
    return result


def isMaterialIdExits(self, table_name, material_id):
    """Return the driver's result for selecting rows with ``material_id``.

    Callers treat the result as the number of matching rows.
    """
    query_sql = "select * from " + table_name + " where material_id = %s"
    cursor = self.conn.cursor()
    result = cursor.execute(query_sql, (material_id,))
    print(result)
    self.conn.commit()
    return result


def insert_inspection_list(self, table_id):
    """Insert the current ``DataInfo`` record into the table for ``table_id``.

    The target table name comes from ``getTableName``.  All field values are
    passed as query parameters, so quotes in scraped text cannot break the
    statement.
    """
    now = self.getCurrentTime()
    sql = (
        "INSERT INTO " + self.getTableName(table_id) + "(title,app_download,time,"
        "channel,type,content,gif,video,source_type,pic_list,pic_path,"
        "device_type,material_size,app_name,created_at,updated_at)"
        " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    # int()/str() mirror the %d / '%s' conversions of the former format string.
    params = (
        str(DataInfo.title),
        str(DataInfo.app_download),
        str(DataInfo.time),
        str(DataInfo.channel),
        int(DataInfo.type),
        str(DataInfo.content),
        json.dumps(DataInfo.gif),
        json.dumps(DataInfo.video),
        int(DataInfo.source_type),
        json.dumps(DataInfo.pic_list),
        json.dumps(DataInfo.pic_path),
        str(DataInfo.device_type),
        str(DataInfo.material_size),
        str(DataInfo.app_name),
        now,
        now,
    )
    self.execute(sql, params)


def getCurrentTime(self):
    """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S")


def getTableName(self, table_id):
    """Return the table name for ``table_id``: ``material_<table_id % 10>``."""
    return "material_" + str(table_id % 10)


def execute(self, sql, params=None):
    """Run one statement on ``self.conn`` and commit.

    Args:
        sql: Statement text.
        params: Optional sequence of values for the statement's
            placeholders.  When omitted the statement is sent unchanged,
            exactly as before this argument existed.
    """
    cursor = self.conn.cursor()
    if params is None:
        cursor.execute(sql)
    else:
        cursor.execute(sql, params)
    self.conn.commit()


def close(self):
    """Close the database connection."""
    self.conn.close()

保存数据库操作

Python3网络爬虫:今日头条新闻App的广告数据抓取相关推荐

  1. Python3网络爬虫:网易新闻App的广告数据抓取

    咱们就不说废话了,直接上完整的源码 def startGetData(self):self.url = "https://nex.163.com/q" body = self.ge ...

  2. Python3网络爬虫:腾讯新闻App的广告数据抓取

    废话就不说了,咱们直接上代码 def startGetData(self):index = 0while index < 3:index = index + 1self.url = " ...

  3. 网络爬虫-今日头条_signature参数逆向(第一弹)

    失踪人口回归ing 今天要讲的是今日头条web版的_signature参数逆向 直接上链接 --> 今日头条 首先随便点开一个版本,这里点的是娱乐,抓包看看结果. 可以很清晰地看到首页html源 ...

  4. 手写网络协议栈-协议封装,netmap,dpdk网卡数据抓取,柔性数组

    今夜只有一个话题,手写网络协议栈,保证大家都能学会 1. 协议头的封装 2. netmap/dpdk的原理 3. 柔性数组的使用 视频讲解如下,点击观看: 手写网络协议栈-协议封装,netmap,dp ...

  5. 网络爬虫-今日头条-街拍

    爬取今日头条里的街拍数据 1.保存标题及图片URL到MongoDB 2.保存图片到本地 通过分析 今日头条街拍 的网页URL,可以发现其内容也是通过Ajax异步加载的,于是分析其API接口: http ...

  6. Android新闻阅读器(数据抓取)

    第一篇技术博客,写得不好请见谅,谢谢(^_^) 由于最近师弟师妹们学习Android的需求,于是就写了此篇博客并且与各位分享一下. 整篇博客总共分为两部分. 第一部分搭建一个新闻列表界面(ListVi ...

  7. 企查查app新增企业数据抓取

    企查查每日新增企业数据抓取 尚未完成的工作: 需要自行抓包获取设备id,appid,sign等等 sign和时间戳保持一致即可 把所有的数据库.redis配置 无法自动登录,账号需要独立 redis数 ...

  8. 转:【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图

    [摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作 在本节 ...

  9. 【Python3网络爬虫开发实战】6.4-分析Ajax爬取今日头条街拍美图

    [摘要] 本节中,我们以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法.这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来. 1. 准备工作 在本节 ...

最新文章

  1. oracle11g开启1158,1、Oracle11g中浏览器访问不了http://localhost:1158/em的问题
  2. Nginx为什么快到根本停不下来?
  3. 将集合中的内容按时间排序
  4. iphone分辨率_目前最值得入手的三款安卓机!流畅度堪比iPhone,用三五年不过时...
  5. UA MATH567 高维统计专题2 Low-rank矩阵及其估计1 Matrix Completion简介
  6. 研究动机(Motivation)-如何写好科技论文之我见(一)
  7. msyql show命令(转)
  8. VS2012 发布网站步骤
  9. 头插法和尾插法创建链表(有无头结点)
  10. 【2016年第1期】基于大数据的小麦蚜虫发生程度决策树预测分类模型
  11. .NET 基础一步步一幕幕[out、ref、params]
  12. netty 学习 (1)
  13. 上班两年干了些啥?该思考人生
  14. 软件项目工程中应该编写的十三类文档
  15. 计算机桌面出现蓝色底色,电脑桌面图标有蓝色阴影怎么去掉
  16. 初中计算机课堂游戏设计,如何设计初中信息技术课堂作业
  17. CentOS 7输入startx无法启动图形化界面
  18. HDU 2111 JAVA
  19. python图书搜索与书籍封面下载
  20. MIT Technology Review 2020年“十大突破性技术”解读 【中国科学基金】2020年第3期发布...

热门文章

  1. 优达学城 深度学习 任务3
  2. PDF时间戳数字签名
  3. win10服务器cpu占用过高,完美解决:Win10资源管理器占用CPU过高
  4. 通俗易懂,什么是.NET?什么是.NET Framework?什么是.NET Core? 转自:https://www.cnblogs.com/1996V/p/9037603.html#net1...
  5. Boot(重点SCSS☆☆☆☆☆)(day03)
  6. 阿里实名认证Java版(详细教程)
  7. linux虚拟机a problem has occurred and the system can‘t recover解决方案
  8. 一行命令批量修改染色体和位置为RS号
  9. 二分查找时间复杂度及其Python实现
  10. 如何玩转OA系统业务审批流程