python爬百度贴吧_python爬虫-爬取百度贴吧帖子加图片

1.[代码][Python]代码

# -*- coding: utf-8 -*-

"""Baidu Tieba thread scraper.

Collects thread ids from a forum's listing pages, downloads the images of
each thread's first post to a local directory, and stores the first post
together with its text replies as one MongoDB document per thread.
"""

import urllib2

import json

import os

from lxml import etree

from pymongo import MongoClient

import sys

# Python 2 only: sys.setdefaultencoding is not reachable on the sys module
# after interpreter start-up, so the module is reloaded to get it back.
reload(sys)

# Make UTF-8 the implicit codec for str <-> unicode conversions.
sys.setdefaultencoding("utf-8")

# Shared MongoDB client for the server at localhost:27017, created at import
# time and used by store_mongodb(); main() closes it.
client = MongoClient('localhost', 27017)

# Name of the Tieba forum to scrape. The same value is reused as the local
# image directory and as the MongoDB collection name.
tb = u'四川大学'

def get_tz_id(tb_name, page_num):
    """Collect the thread ids listed on the first pages of a Tieba forum.

    Args:
        tb_name: Forum name, substituted into the ``kw`` query parameter.
        page_num: Number of listing pages to fetch, starting with page 1.

    Returns:
        A list holding the ``id`` value of every thread entry found, in the
        order the entries appear across the fetched pages.

    Raises:
        urllib2.URLError: If a listing page cannot be fetched.
        ValueError: If an entry's ``data-field`` attribute is not valid JSON.
    """
    tz_id = []
    for page in range(1, page_num + 1):
        # ``pn`` is an offset that advances by 50 per page; page 1 is pn=0.
        url = "http://tieba.baidu.com/f?kw=%s&pn=%s" % (tb_name, (page - 1) * 50)
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        # The first <li> is skipped (presumably a non-thread header row --
        # confirm against the live markup).
        for li in tree.xpath('//*[@id="thread_list"]/li')[1:]:
            data_field = li.xpath('./@data-field')
            if not data_field:
                # Entries without a data-field attribute are Baidu promotions.
                continue
            # data-field is JSON served by a remote site: parse it as data.
            # eval() would execute whatever the page contains and also fails
            # on JSON literals such as null/true/false.
            tz_id.append(json.loads(data_field[0])['id'])
    return tz_id

def save_img(path, img_id, url):
    """Download one image and write it to ``<path>/<img_id>.jpg``.

    The download is best effort: a ``urllib2.URLError`` is printed and the
    image is skipped, and an empty response body produces no file.

    Args:
        path: Directory to store the image in; created if it does not exist.
        img_id: File name (without extension) for the saved image.
        url: Address the image is downloaded from.

    Returns:
        None.
    """
    try:
        picture = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print(e)
        return
    if not picture:
        return

    if not os.path.exists(path):
        os.makedirs(path)
    # The context manager closes the handle even when the write fails.
    with open('%s/%s.jpg' % (path, img_id), "wb") as f:
        f.write(picture)

def store_mongodb(dic):
    """Insert ``dic`` into the ``bdtb`` database, in the collection named by ``tb``.

    Returns whatever the collection's ``insert`` call returns.
    """
    collection = client.bdtb[tb]
    return collection.insert(dic)

def _save_post_images(first_floor, tz_id):
    """Download the images of the first post and return their local path stems."""
    image_li = []
    number = 1
    for img in first_floor.xpath('./div[3]/div[1]/cc/div/img'):
        src = img.get('src')
        # URLs containing "static" are treated as Tieba emoticons and skipped,
        # as are <img> tags that carry no src at all.
        if not src or 'static' in src:
            continue
        img_id = '%s_%s' % (tz_id, number)
        save_img(tb, img_id, src)
        image_li.append('%s/%s' % (tb, img_id))
        number += 1
    return image_li


def _collect_replies(tree):
    """Return one dict (dateline, author, content) per reply floor that has text."""
    reply_li = []
    for each_floor in tree.xpath('//div[@class="l_post j_l_post l_post_bright "]'):
        body = each_floor.xpath('./div[3]/div[1]/cc/div')
        if not body:
            # A floor without a content div is a Baidu promotion: skip only
            # this floor so the rest of the thread is still stored.
            continue
        reply_content = body[0].xpath('string(.)').strip()
        if not reply_content:
            continue  # replies without any text are not recorded
        re_info = json.loads(each_floor.xpath('./@data-field')[0])
        reply_li.append({
            'dateline': re_info['content']['date'],
            'author': re_info['author']['user_name'],
            'content': reply_content,
        })
    return reply_li


def get_info(tz_id):
    """Scrape one thread and store it as a MongoDB document.

    The stored document has the keys ``content``, ``image``, ``source``,
    ``title``, ``dateline``, ``sex``, ``author`` and ``reply``. ``image`` is a
    list of local path stems when the first post contains images, otherwise
    the string ``'null'``.

    Args:
        tz_id: Thread id, as returned by ``get_tz_id``.

    Returns:
        False when the thread is skipped (no first post found in the page, or
        a first post that contains images but no text); otherwise None, after
        the document has been passed to ``store_mongodb``.

    Raises:
        urllib2.URLError: If the thread page cannot be fetched.
    """
    tz_url = 'http://tieba.baidu.com/p/%s' % tz_id
    html = urllib2.urlopen(tz_url).read()
    tree = etree.HTML(html)

    first_floors = tree.xpath('//div[@class="l_post j_l_post l_post_bright noborder "]')
    if not first_floors:
        return False  # page layout did not match: nothing to scrape
    first_floor = first_floors[0]

    bodies = first_floor.xpath('./div[3]/div[1]/cc/div')
    if not bodies:
        return False
    content = bodies[0]

    # string(.) gathers the text of the whole node, so text that follows a
    # child element (<br>, <a>, <img>) is kept and a node that starts with a
    # child element does not yield None.
    text = content.xpath('string(.)').strip()
    has_images = bool(content.xpath('./img'))
    if has_images and not text:
        return False  # image-only posts are not stored

    title = tree.xpath('//div[@class="core_title core_title_theme_bright"]/h1/@title')
    data_info = json.loads(first_floor.xpath('./@data-field')[0])

    info = {
        'content': text,
        'image': _save_post_images(first_floor, tz_id) if has_images else 'null',
        'source': tb,
        'title': ''.join(title),
        'dateline': data_info['content']['date'],  # creation time
        'sex': data_info['author']['user_sex'],
        'author': data_info['author']['user_name'],
        'reply': _collect_replies(tree),
    }
    store_mongodb(info)

def main():
    """Scrape the first listing page of forum ``tb`` and store every thread."""
    try:
        for tz_id in get_tz_id(tb, 1):
            get_info(tz_id)
    finally:
        # Release the MongoDB client even when a fetch or parse step raises.
        client.close()


# Run the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

python爬百度贴吧_python爬虫-爬取百度贴吧帖子加图片相关推荐

python爬虫抓取百度图片_Python爬虫抓取百度的高清摄影图片
成果预览: 源代码: import requests import re url = 'https://image.baidu.com/search/index' headers = { 'User- ...
python爬图代码实例_Python爬虫爬取煎蛋网图片代码实例
这篇文章主要介绍了Python爬虫爬取煎蛋网图片代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下今天,试着爬取了煎蛋网的图片. 用到的包: ...
python爬贴吧回复_Python爬虫——抓取贴吧帖子
对珊瑚老哥保证了自己会尽量补档动画MTV吧的资源,有空应该研究下爬虫了. 不要在意头图,我不会假借各位对某个动漫的爱好然后坑人的.无论是电磁炮吧主那种拿电磁铁糊弄人的奸商,还是逸站靠小林做幌子卖收费破 ...
python爬网页源码_python爬虫爬取网页的内容和网页源码不同？
可以看到这里id为k_total的元素值不同,爬出来是1,网页源码是55. 附还未完成的代码:import requests from bs4 import BeautifulSoup import ...
python手机壁纸超清_python爬虫爬取超清壁纸代码实例
简介壁纸的选择其实很大程度上能看出电脑主人的内心世界,有的人喜欢风景,有的人喜欢星空,有的人喜欢美女,有的人喜欢动物.然而,终究有一天你已经产生审美疲劳了,但你下定决定要换壁纸的时候,又发现网上的壁 ...
python爬取百度贴吧中的所有邮箱_使用 Python 编写多线程爬虫抓取百度贴吧邮箱与手机号...
原标题:使用 Python 编写多线程爬虫抓取百度贴吧邮箱与手机号不知道大家过年都是怎么过的,反正栏主是在家睡了一天,醒来的时候登QQ发现有人找我要一份贴吧爬虫的源代码,想起之前练手的时候写过一个抓 ...
python爬百度贴吧_Python爬虫实战之爬取百度贴吧帖子
大家好,上次我们实验了爬取了糗事百科的段子,那么这次我们来尝试一下爬取百度贴吧的帖子.与上一篇不同的是,这次我们需要用到文件的相关操作. 本篇目标对百度贴吧的任意帖子进行抓取指定是否只抓取楼主发帖内 ...
python爬取新浪微博_Python爬虫爬取新浪微博内容示例【基于代理IP】
Python爬虫爬取新浪微博内容示例[基于代理IP] 发布时间:2020-09-07 10:08:14 来源:脚本之家阅读:120 本文实例讲述了Python爬虫爬取新浪微博内容.分享给大家供大家参 ...
python爬取微博评论点赞数_python 爬虫爬微博分析数据
python 爬虫爬微博分析数据最近刚看完爱情公寓5,里面的大力也太好看了吧... 打开成果的微博,小作文一样的微博看着也太爽了吧... 来用python分析分析狗哥这几年微博的干了些啥. ...
python爬取智联招聘网_Python爬虫爬取智联招聘（进阶版）
点击上方"程序人生",选择"置顶公众号" 第一时间关注程序猿(媛)身边的故事图片:Westworld Season 2 作者王强简介 Python追随者, ...

python爬百度贴吧_python爬虫-爬取百度贴吧帖子加图片

python爬百度贴吧_python爬虫-爬取百度贴吧帖子加图片相关推荐

最新文章

热门文章