python爬取贴吧网页信息

第一步，分析网页，代码如下：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
author: superWang
date: 2017-09-15
requests module: 2.18.4
bs4 module: 4.6.0
json module: 2.0.9
time module: n/a

Exploratory script: scrape the Tieba thread page
http://c.tieba.baidu.com/p/4994831746?pn=1 and print what can be extracted
from it. "pn=1" in the URL selects the first page.
'''
import requests
from bs4 import BeautifulSoup
import json
import time

# --- Part 1: information about a single floor (post) of the thread ---
url = 'http://c.tieba.baidu.com/p/4994831746?pn=1'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html5lib')
l_posts = soup.select("#j_p_postlist .l_post")

# l_posts[1] (the second floor) is used instead of l_posts[0] because the
# second floor has reply comments, which makes it a better test subject.
post = l_posts[1]
print(post.select(".d_author .d_name .p_author_name")[0].text)  # author name
# The badge level is printed from the element text; the element's 'title'
# attribute was the alternative the author tried:
# print(post.select(".d_author .l_badge .user_badge")[0]['title'])
print(post.select(".d_author .l_badge .user_badge")[0].text)  # level
print(post.select(".d_post_content_main .p_content cc")[0].text)  # content
tail_infos = post.select(
    ".d_post_content_main .core_reply .post-tail-wrap .tail-info")
print(tail_infos[1].text)  # floor number
print(tail_infos[2].text)  # post time

# --- Part 2: reply comments attached to that floor ---
# The post's "data-field" attribute is a JSON string; it carries the ids
# needed to query the comment endpoint.
jd = json.loads(post["data-field"])  # decode into a Python dict
print(jd)
tid = jd["content"]["thread_id"]  # thread id, query parameter "tid"
fid = jd["content"]["forum_id"]  # forum id, query parameter "fid"
post_id = jd["content"]["post_id"]  # key of this floor in comment_list
url2 = ("http://c.tieba.baidu.com/p/totalComment?pn=1&see_lz=0&tid="
        + str(tid) + "&fid=" + str(fid))
res2 = requests.get(url2)
print(res2.text)  # every reply comment on this page, as raw JSON text
jd2 = json.loads(res2.text)  # decode into a Python dict
comment_info = jd2["data"]["comment_list"][str(post_id)]["comment_info"]
print(comment_info)  # list of reply comments for this floor
print(comment_info[0]["now_time"])
now_time = comment_info[0]["now_time"]  # timestamp of the reply
username = comment_info[0]["username"]  # name of the replier
content = comment_info[0]["content"]  # text of the reply
# Convert the timestamp to a struct_time, then to a readable string.
time_now = time.localtime(now_time)
print(time.strftime("%Y-%m-%d %H:%M:%S", time_now))

# --- Part 3: detect whether a floor was written by the thread starter ---
# One ".louzhubiaoshi_wrap" element means the author is the thread starter.
print(len(l_posts[1].select(".d_author .louzhubiaoshi_wrap")))  # 1: starter
print(len(l_posts[4].select(".d_author .louzhubiaoshi_wrap")))  # 0: not starter

第二步，整理分析的代码如下：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
author: superWang
date: 2017-09-15
requests module: 2.18.4
bs4 module: 4.6.0
json module: 2.0.9
time module: n/a

Scrape the Tieba thread page http://c.tieba.baidu.com/p/4994831746?pn=1 and
print each floor together with its reply comments. "pn=1" in the URL selects
the first page.
'''
import requests
from bs4 import BeautifulSoup
import json
import time


class GetTieBaInfo():
    """Print the floors (posts) and reply comments of one Tieba thread page."""

    # Thread page URL and comment endpoint; the page number is appended.
    THREAD_URL = 'http://c.tieba.baidu.com/p/4994831746?pn='
    COMMENT_URL = 'http://c.tieba.baidu.com/p/totalComment?pn='
    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def getInfoWithPage(self, page):
        """Print every floor on ``page`` along with its reply comments."""
        self._printPosts(page, only_author=False)

    def getAuthorInfoWithPage(self, page):
        """Print only the thread starter's floors on ``page`` and their replies."""
        self._printPosts(page, only_author=True)

    def getBackWithPage(self, page, tid, fid):
        """Fetch the reply comments of every floor on ``page``.

        ``tid`` and ``fid`` are the thread id and forum id read from a post's
        "data-field" attribute. Returns the "comment_list" value of the JSON
        response, which the callers index by ``str(post_id)``.
        """
        url = (self.COMMENT_URL + str(page) + "&see_lz=0&tid=" + str(tid)
               + "&fid=" + str(fid))
        res2 = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        jd2 = json.loads(res2.text)  # decode into a Python dict
        return jd2["data"]["comment_list"]

    def _printPosts(self, page, only_author):
        """Fetch ``page`` and print its floors, optionally only the starter's."""
        url = self.THREAD_URL + str(page)
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.text, 'html5lib')
        l_posts = soup.select("#j_p_postlist .l_post")
        if not l_posts:
            # Nothing matched the selector (empty or unexpected page); there
            # is no first post to read the thread/forum ids from.
            return

        # The ids needed by the comment endpoint are taken from the first post.
        jd = json.loads(l_posts[0]["data-field"])
        tid = jd["content"]["thread_id"]
        fid = jd["content"]["forum_id"]
        comment_list = self.getBackWithPage(page, tid, fid)

        for l_post in l_posts:
            # Exactly one ".louzhubiaoshi_wrap" element marks the thread starter.
            is_author = len(l_post.select(".d_author .louzhubiaoshi_wrap")) == 1
            if only_author and not is_author:
                continue
            self._printPost(l_post, comment_list)

    def _printPost(self, l_post, comment_list):
        """Print one floor's summary line followed by its reply comments."""
        p_author_name = l_post.select(".d_author .d_name .p_author_name")[0].text
        user_badge = l_post.select(".d_author .l_badge .user_badge")[0].text
        p_content = l_post.select(".d_post_content_main .p_content cc")[0].text.strip()
        tail_infos = l_post.select(
            ".d_post_content_main .core_reply .post-tail-wrap .tail-info")
        tail_info = tail_infos[1].text  # floor number
        tail_time = tail_infos[2].text  # post time
        print("名字:" + p_author_name + "\t等级:" + user_badge + "\t内容:" + p_content
              + "\t楼数:" + tail_info + "\t时间:" + tail_time)

        jd = json.loads(l_post["data-field"])
        post_id = jd["content"]["post_id"]  # key of this floor in comment_list
        # NOTE(review): a non-dict payload (e.g. an empty JSON array) is
        # treated as "no comments" - confirm what the endpoint returns for a
        # page without any reply comments.
        if not isinstance(comment_list, dict):
            return
        try:
            comment_infos = comment_list[str(post_id)]["comment_info"]
        except KeyError:
            # This floor has no reply comments.
            return

        print("\t此楼层回复：")
        for comment_info in comment_infos:
            now_time = comment_info["now_time"]  # timestamp of the reply
            username = comment_info["username"]  # name of the replier
            content = comment_info["content"]  # text of the reply
            # Timestamp -> local struct_time -> readable string.
            back_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now_time))
            print("\t\t姓名:" + username + "\t\t信息:" + content + "\t\t时间:" + back_time)


if __name__ == "__main__":
    a = GetTieBaInfo()
    a.getAuthorInfoWithPage(1)

最后的效果图如下：

python爬取贴吧网页信息相关推荐

用python爬取东方财富网网页信息_爬取东方财富网数据的网页分析
自学Python已有3个月之多,浏览无数大神的佳作,收获颇丰.当初自学python就是为了学习爬虫,爬取网站上好看妹子的图片--[流口水][流口水] 言归正传,近期学习量化交易知识,发现东方财富网(e ...
python关于二手房的课程论文_基于python爬取链家二手房信息代码示例
基本环境配置 python 3.6 pycharm requests parsel time 相关模块pip安装即可确定目标网页数据哦豁,这个价格..................看到都觉得脑阔 ...
Python爬取安居客经纪人信息
Python爬取安居客经纪人信息 Python2.7.15 今天我们来爬取安居客经纪人的信息.这次我们不再使用正则,我们使用beautifulsoup.不了解的可以先看一下这个文档,便于理解.http ...
Python爬取药监局化妆品管理信息发现的问题
Python爬取药监局化妆品管理信息 **1.json格式本质上是字符串!!! 今天在爬取国家药监局化妆品管理信息的时候,发现"json数据本质上是字符串",以前我还以为json本 ...
Python 爬取拉勾招聘信息
Python 爬取拉勾招聘信息故事背景最近有个好哥们啊浪迫于家里工资太低,准备从北方老家那边来深圳这边找工作,啊浪是学平面设计的知道我在深圳这边于是向我打听深圳这边平面设计薪资水平,当时我有点懵逼 ...
运用Python爬取二手房价格与信息的两种常用方法
最近房地产市场进一步收紧,多地地方政府出台各种收紧政策,以保证房地产健康发展,因此云朵君就想到运用Python网络爬虫,抓取部分房产信息,了解下最近房地产的情况. 接下来以房天下二手房信息,以获取某个 ...
python爬取 “得到” App 电子书信息
前言 本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取 pyt ...
python爬取知网论文信息
用Python爬取知网关于某个主题的所有论文摘要等信息. 很幸运的找到了一个大佬的代码来自己改改改! 先放大佬代码连接 Git 接下来就是我自己嚯嚯嚯改的,很小白的了... 应该是很详细得了为了看懂 ...
python爬取b站用户_用Python爬取bilibili全站用户信息
教你用Python爬取哔哩哔哩全站用户信息运行下载 git clone https://github.com/cexll/bili_user_Spider.git 复制代码运行环境 Window ...
用python爬取交大图书馆图书信息
由于到图书馆中查找数据的时候,每个网页都需要一张一张的翻转,而同时因为每张网页中的内容十分有限,故写此爬虫,方便查找之用 # -*- coding=utf-8 -*- #@author: .Edgar ...

python爬取贴吧网页信息

python爬取贴吧网页信息相关推荐

最新文章

热门文章