爬取百度贴吧帖子页内容

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

说明：仅学习参考，如有侵权，将立即删除此内容

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

开发文档

version_2

功能：爬取输入贴吧名字(如“李毅”)的帖子内容，并保存获取的原始html文件,以及处理后的包含相关字段的json文件

函数：next_page_url.py，主要负责获取next_page_url

　　　page_key_info.py，主要负责提取当页的关键信息

　　　settings.py，存放设置

　　　request_response.py用来处理请求

　　　tiezi_total.py 实现主要逻辑，并爬取相关内容

　　　main_spider.py程序运行的接口(未启用)

文件：jsonfiletotal存放提取出的json数据

技术点：多线程、线程中的通讯(队列)、递归获取下页地址、生产者消费者模型

bug_1：这个版本，目前流程大体上是正确的，但是会在解析第二页内容时，报UnicodeDecodeError，目前本人没能力解决，大佬如果有经验，方便的话，希望可以提点一下。

bug_1修复：问题已解决，代码已修复。原因是一个细节问题：请求下一页地址时，没有把地址正确地拼接好。

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

源码

# settings.py
# User-Agent pool.
# TODO: only a single User-Agent is configured so far.
# Request headers sent with every HTTP request made by the crawler.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}

# tiezi_total.pyimport requests
from urllib import parse
import re
import json
from queue import Queue
from threading import Thread
import time
from settings import HEADERS
from next_page_url import NextPageUrl
from page_key_info import PageKeyInfo
from request_response import RequestResponse
import os


class MainProcess():
    """Producer/consumer crawler for the list pages of one Tieba forum.

    Three threads cooperate through queues:
      * a producer downloads each list page and follows "next page" links,
      * a parser turns each raw page into a list of post dictionaries,
      * a writer stores every parsed page as a JSON file.

    Args:
        tieba_name: forum name; it is URL-quoted and substituted into ``url``.
        url: start URL template containing one ``{}`` placeholder.
    """

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))
        # url_queue and rawhtml_queue are filled in lock-step: the n-th URL
        # belongs to the n-th raw HTML string.
        self.url_queue = Queue()
        self.rawhtml_queue = Queue()
        self.content_queue = Queue()

    def __make_url_and_rawhtml(self, url):
        """Produce (url, raw html) pairs, following next-page links.

        Implemented as a loop rather than one recursive call per page so a
        forum with many pages cannot exhaust the recursion limit.
        """
        while url:
            html_str = RequestResponse(url).run()
            # Queue the URL only after its page was fetched, so both queues
            # always hold the same number of entries.
            self.url_queue.put(url)
            self.rawhtml_queue.put(html_str)
            url = NextPageUrl(html_str).run()
            print(url)

    def __make_key_info(self):
        """Consume (url, raw html) pairs and produce parsed page content."""
        # Queue.get() blocks until an item is available, so an endless loop
        # is correct here; the thread is a daemon and ends with the process.
        while True:
            url = self.url_queue.get()
            html_str = self.rawhtml_queue.get()
            try:
                item_list = PageKeyInfo(html_str).run()
                # The page URL travels with the data as the last list entry.
                item_list.append(dict(current_page_url=url))
                self.content_queue.put(item_list)
                print("开始从当前{}提取信息".format(url))
            except Exception as exc:
                # Worker boundary: report and keep consuming, otherwise the
                # queue joins in run() would wait forever.
                print("提取{}信息失败：{}".format(url, exc))
            finally:
                self.url_queue.task_done()
                self.rawhtml_queue.task_done()

    def __save_json_file(self):
        """Consume parsed page content and write one JSON file per page."""
        os.makedirs("./jsonfiletotal", exist_ok=True)
        while True:
            content = self.content_queue.get()
            try:
                # The file name is the unquoted query string of the page URL.
                url = content[-1]["current_page_url"]
                filename = parse.unquote(re.split(pattern=r'\?', string=url)[-1]) + ".json"
                with open("./jsonfiletotal/" + filename, 'w', encoding='utf8') as f:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                print("保存" + filename + "文件成功")
            except Exception as exc:
                print("保存文件失败：{}".format(exc))
            finally:
                self.content_queue.task_done()

    def run(self):
        """Start the three worker threads and wait until all pages are saved."""
        producer = Thread(target=self.__make_url_and_rawhtml, args=(self.url,))
        thread_list = [
            producer,
            Thread(target=self.__make_key_info),
            Thread(target=self.__save_json_file),
        ]
        for t in thread_list:
            # Daemon threads do not keep the process alive once the queues
            # have been drained.
            t.daemon = True
            t.start()
        # Wait for the producer first: joining the queues while pages are
        # still being downloaded could return during a momentary lull.
        producer.join()
        self.url_queue.join()
        self.rawhtml_queue.join()
        self.content_queue.join()


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = MainProcess(tieba_name, first_url)
    obj.run()

# request_response.py
import requests

from settings import HEADERS


class RequestResponse():
    """Fetch one URL and return the response body as a string.

    Args:
        url: address to request.
        timeout: seconds to wait for the server before giving up.
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def __get_response(self, url):
        """Send a GET request for ``url`` and return the decoded body.

        The shared ``HEADERS`` are sent with the request. The status code is
        printed but not checked.
        """
        # Without a timeout a stalled connection would block the caller
        # indefinitely.
        response = requests.get(url, headers=HEADERS, timeout=self.timeout)
        print("请求响应代码：", response.status_code)
        # Replace undecodable bytes instead of raising UnicodeDecodeError.
        return response.content.decode("utf-8", errors="replace")

    def run(self):
        """Fetch ``self.url`` and return the raw, unprocessed page text."""
        return self.__get_response(self.url)

# next_page_url.py
import re


class NextPageUrl():
    """Extract the "next page" URL from the raw HTML of a forum list page.

    Args:
        html_str: raw page text as produced from the HTTP response.
    """

    def __init__(self, html_str):
        """Store the raw HTML string."""
        self.html_str = html_str

    def __info_str(self, html_str):
        """Return the contents of the hidden thread-list ``<code>`` element.

        Returns an empty string when the element is not present.
        """
        found = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', html_str, re.S)
        return found[0] if found else ""

    def __parse_next_url(self, html_):
        """Return the absolute next-page URL, or None when there is none."""
        # The pager lives in the bottom div of the thread list.
        pager = re.findall(r'<div class=\"thread_list_bottom clearfix\">(.*?)-->', html_, re.S)
        if not pager:
            return None
        # The next link cannot be located directly, so inspect the attribute
        # text of every anchor and pick the one carrying the "next" class.
        for attrs in re.findall(r'<a(.*?)>', pager[0], re.S):
            if "next pagination-item" in attrs:
                hrefs = re.findall(r'href="(.*?)"', attrs, re.S)
                if hrefs:
                    # The href has no scheme, so one is prepended.
                    return "https:" + hrefs[0]
        return None

    def run(self):
        """Public entry point: return the next page URL or None."""
        thread_list_html = self.__info_str(self.html_str)
        return self.__parse_next_url(thread_list_html)

# page_key_info.py
import re


class PageKeyInfo():
    """Extract the key fields of every post on one forum list page.

    Args:
        html_str: raw page text as produced from the HTTP response.
    """

    def __init__(self, html_str):
        """Store the raw HTML string."""
        self.html_str = html_str

    @staticmethod
    def __first(matches):
        """Return the first regex match, or None when nothing matched."""
        return matches[0] if matches else None

    def __info_str(self, html_str):
        """Return the contents of the hidden thread-list ``<code>`` element.

        Returns an empty string when the element is not present.
        """
        found = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', html_str, re.S)
        return found[0] if found else ""

    def __get_usefulinfo_by_one(self, ul_one):
        """Build the field dictionary for a single post.

        Args:
            ul_one: HTML fragment of one list item.

        Returns:
            dict with keys title, title_href, author_name, author_id,
            author_home, content (each a string or None) and image (a list
            of strings, possibly empty).
        """
        # Title and link. The title pattern stops at the closing quote so the
        # captured title does not carry a trailing '"'.
        title_and_href = self.__first(re.findall(r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"', ul_one, re.S))
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None

        # Author name, id and home page.
        author_name = self.__first(re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S))
        author_id = self.__first(re.findall(r'title="主题作者.*?".*?data-field=\'{&quot;user_id&quot;:(.*?)}\' >', ul_one, re.S))
        author_home = self.__first(re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S))
        if author_home is not None:
            author_home = "https://tieba.baidu.com" + author_home

        # Abstract text and picture addresses.
        content = self.__first(re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S))
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)

        return {
            "title": title_,
            "title_href": title_href_,
            "author_name": author_name,
            "author_id": author_id,
            "author_home": author_home,
            "content": content,
            "image": image,
        }

    def __ul_content(self, html_):
        """Split the thread list into one HTML fragment per post.

        A lookahead ends each fragment at the next item opener (or at the end
        of the text) without consuming it. Consuming the opener made every
        second post, and the last one, disappear.
        """
        return re.findall(r'li class=\" j_thread_list clearfix\"(.*?)(?=<li class=\" j_thread_list clearfix\"|\Z)', html_, re.S)

    def __get_content(self, html_):
        """Return the list of field dictionaries for all posts in ``html_``."""
        return [self.__get_usefulinfo_by_one(ul_one) for ul_one in self.__ul_content(html_)]

    def run(self):
        """Public entry point: return a list with one dict per post."""
        thread_list_html = self.__info_str(self.html_str)
        return self.__get_content(thread_list_html)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

开发文档

version_1
使用方式：python main_spider.py 李毅
功能：爬取输入贴吧名字(如“李毅”)的帖子内容，并保存获取的原始html文件,以及处理后的包含相关字段的json文件
函数：settings.py 存放设置
　　　tieziparse.py处理单页的内容，提取有用信息
　　　main_spider.py 执行主要逻辑
　　　tieba_title_total.py，接收贴吧名字和url，处理单页字符串，待修改
文件：htmlfiletotal存放获取到的html页面
　　　jsonfiletotal存放提取出的json数据

说明：属于单线程，目前能够执行，优化请看version_2

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

源码

# settings.py: holds the main parameters.
# User-Agent pool.
# TODO: only a single User-Agent is configured so far.
# Request headers sent with every HTTP request made by the crawler.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}

# main_spider.py
import sys

from tieba_title_total import TitleTotal


class ProcessMain():
    """Single-threaded driver that crawls a forum page by page.

    Args:
        tieba_name: forum name.
        first_url: start URL template; the ``{}`` placeholder is filled in
            by TitleTotal.
    """

    def __init__(self, tieba_name, first_url):
        self.tieba_name = tieba_name
        self.first_url = first_url

    def tieba_current_pagecontent(self, tieba_name, url):
        """Process the page at ``url`` and every following page.

        Each TitleTotal run returns the next page URL. Iterating instead of
        recursing once per page keeps the call stack flat on long crawls.
        """
        while url:
            url = TitleTotal(tieba_name, url).run()

    def run(self):
        """Crawl the forum starting from the first URL."""
        self.tieba_current_pagecontent(self.tieba_name, self.first_url)


if __name__ == "__main__":
    # The forum name is a required command-line argument.
    if len(sys.argv) < 2:
        sys.exit("使用方式：python main_spider.py 贴吧名字")
    tieba_name = sys.argv[1]
    # tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = ProcessMain(tieba_name, first_url)
    obj.run()

# tieba_title_total.py
import json
import os
import re
from urllib import parse

import requests

from settings import HEADERS
from tieziparse import TieziParse


class TitleTotal():
    """Download one forum list page, save it, and extract its key fields.

    Fields gathered for the page (via TieziParse):
        title: title, title_href
        author: author_name, author_id, author_home
        content: content, image
        next page address: next_page_url

    Args:
        tieba_name: forum name; URL-quoted and substituted into ``url``.
        url: page URL, optionally containing one ``{}`` placeholder.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, tieba_name, url):
        self.tieba_name = tieba_name
        self.url = url.format(parse.quote(tieba_name))

    def __get_response(self, url):
        """Send a GET request for ``url`` and return the decoded body.

        The shared ``HEADERS`` are sent with the request. The status code is
        printed but not checked.
        """
        response = requests.get(url, headers=HEADERS, timeout=self.REQUEST_TIMEOUT)
        print("响应代码：", response.status_code)
        # Replace undecodable bytes instead of raising UnicodeDecodeError.
        return response.content.decode("utf-8", errors="replace")

    def __save_raw_file(self, content, filename):
        """Write the raw page text to ./htmlfiletotal/<filename>."""
        os.makedirs("./htmlfiletotal", exist_ok=True)
        with open("./htmlfiletotal/" + filename, 'w', encoding='utf8') as f:
            f.write(content)
        print("写入原始文件成功")

    def __save_json_file(self, content, filename):
        """Write ``content`` as JSON to ./jsonfiletotal/<filename>."""
        os.makedirs("./jsonfiletotal", exist_ok=True)
        with open("./jsonfiletotal/" + filename, 'w', encoding='utf8') as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
        print("保存json文件成功")

    def __parse_html_str(self, html_str):
        """Return a dict with the page's next_page_url and content_list."""
        next_page_url, item_list = TieziParse(html_str).run()
        return {
            "next_page_url": next_page_url,
            "content_list": item_list,
        }

    def run(self):
        """Fetch, store and parse the page; return the next page URL."""
        print("开始url:", self.url)
        html_str = self.__get_response(self.url)

        # Both output files are named after the unquoted query string.
        base_name = parse.unquote(re.split(pattern=r'\?', string=self.url)[-1])
        self.__save_raw_file(content=html_str, filename=base_name + ".html")

        key_info_dict = self.__parse_html_str(html_str)
        self.__save_json_file(content=key_info_dict, filename=base_name + ".json")

        print("下一页url:", key_info_dict["next_page_url"])
        return key_info_dict["next_page_url"]


# Test case
if __name__ == "__main__":
    tieba_name = "李毅"
    first_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0"
    obj = TitleTotal(tieba_name, first_url)
    obj.run()

# tieziparse.py
import re


class TieziParse():
    """Parse the raw HTML of one forum list page.

    Extracted information:
        title: title, title_href
        author: author_name, author_id, author_home
        content: content, image
        next page address: next_page_url

    Args:
        html_str: raw page text containing all posts of a single page.
    """

    def __init__(self, html_str):
        """Store the raw HTML string."""
        self.html_str = html_str

    @staticmethod
    def __first(matches):
        """Return the first regex match, or None when nothing matched."""
        return matches[0] if matches else None

    def __info_str(self):
        """Return the contents of the hidden thread-list ``<code>`` element.

        Returns an empty string when the element is not present.
        """
        found = re.findall(r'<code class=\"pagelet_html\" id=\"pagelet_html_frs-list/pagelet/thread_list\" style=\"display:none;\">(.*?)</code>', self.html_str, re.S)
        return found[0] if found else ""

    def __parse_next_url(self, html_):
        """Return the absolute next-page URL, or None when there is none."""
        # The pager lives in the bottom div of the thread list.
        pager = re.findall(r'<div class=\"thread_list_bottom clearfix\">(.*?)-->', html_, re.S)
        if not pager:
            return None
        # The next link cannot be located directly, so inspect the attribute
        # text of every anchor and pick the one carrying the "next" class.
        for attrs in re.findall(r'<a(.*?)>', pager[0], re.S):
            if "next pagination-item" in attrs:
                hrefs = re.findall(r'href="(.*?)"', attrs, re.S)
                if hrefs:
                    # The href has no scheme, so one is prepended.
                    return "https:" + hrefs[0]
        return None

    def __get_usefulinfo_by_one(self, ul_one):
        """Build the field dictionary for a single post.

        Args:
            ul_one: HTML fragment of one list item.

        Returns:
            dict with keys title, title_href, author_name, author_id,
            author_home, content (each a string or None) and image (a list
            of strings, possibly empty).
        """
        # Title and link. The title pattern stops at the closing quote so the
        # captured title does not carry a trailing '"'.
        title_and_href = self.__first(re.findall(r'j_th_tit ">.*?<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"', ul_one, re.S))
        if title_and_href:
            title_href_ = "https://tieba.baidu.com" + title_and_href[0]
            title_ = title_and_href[1]
        else:
            title_href_ = None
            title_ = None

        # Author name, id and home page.
        author_name = self.__first(re.findall(r'<span class="tb_icon_author ".*?title="主题作者: (.*?)"', ul_one, re.S))
        author_id = self.__first(re.findall(r'title="主题作者.*?".*?data-field=\'{&quot;user_id&quot;:(.*?)}\' >', ul_one, re.S))
        author_home = self.__first(re.findall(r'class="frs-author-name j_user_card " href="(.*?)" target="_blank">', ul_one, re.S))
        if author_home is not None:
            author_home = "https://tieba.baidu.com" + author_home

        # Abstract text and picture addresses.
        content = self.__first(re.findall(r'<div class="threadlist_abs threadlist_abs_onlyline ">(.*?)</div>', ul_one, re.S))
        image = re.findall(r'bpic="(.*?)" class="threadlist_pic j_m_pic', ul_one, re.S)

        return {
            "title": title_,
            "title_href": title_href_,
            "author_name": author_name,
            "author_id": author_id,
            "author_home": author_home,
            "content": content,
            "image": image,
        }

    def __ul_content(self, html_):
        """Split the thread list into one HTML fragment per post.

        A lookahead ends each fragment at the next item opener (or at the end
        of the text) without consuming it. Consuming the opener made every
        second post, and the last one, disappear.
        """
        return re.findall(r'li class=\" j_thread_list clearfix\"(.*?)(?=<li class=\" j_thread_list clearfix\"|\Z)', html_, re.S)

    def __get_content(self, html_):
        """Return the list of field dictionaries for all posts in ``html_``."""
        return [self.__get_usefulinfo_by_one(ul_one) for ul_one in self.__ul_content(html_)]

    def run(self):
        """Public entry point.

        Returns:
            A tuple ``(next_page_url, item_list)``: the next page URL (or
            None) and a list with one dict per post on the current page.
        """
        thread_list_html = self.__info_str()
        next_page_url = self.__parse_next_url(thread_list_html)
        item_list = self.__get_content(thread_list_html)
        return next_page_url, item_list

爬取百度贴吧帖子页内容相关推荐

Python爬虫实战之爬取百度贴吧帖子
Python爬虫实战之爬取百度贴吧帖子大家好,上次我们实验了爬取了糗事百科的段子,那么这次我们来尝试一下爬取百度贴吧的帖子.与上一篇不同的是,这次我们需要用到文件的相关操作. 本篇目标对百度贴吧的 ...
python爬虫(13)爬取百度贴吧帖子
爬取百度贴吧帖子一开始只是在网上看到别人写的爬取帖子的文章,然后自己就忍不住手痒自己锻炼一下, 然后照着别人的写完,发现不太过瘾, 毕竟只是获取单个帖子的内容,感觉内容稍显单薄,然后自己重新做了修改 ...
python爬取贴吧所有帖子-Python爬虫实例（一）爬取百度贴吧帖子中的图片
程序功能说明:爬取百度贴吧帖子中的图片,用户输入贴吧名称和要爬取的起始和终止页数即可进行爬取. 思路分析: 一.指定贴吧url的获取例如我们进入秦时明月吧,提取并分析其有效url如下 ?后面为查询字 ...
java 百度贴吧爬虫,爬取百度贴吧帖子
依然是参考教程 Python爬虫实战一之爬取百度贴吧帖子.作者崔庆才写了很多关于Python爬虫的文章,大家有兴趣的话可以去他的个人博客静觅学习. 这次将爬取的数据写入了文本文件中,并用到了re模块中 ...
Python3 简单爬虫爬取百度贴吧帖子
使用Python3.x的版本对http://tieba.baidu.com/p/2005436135,该百度贴吧帖子进行爬取操作. 一.使用到的库. 1. urllib.request :对 ...
【Python网络编程】爬取百度贴吧、小说内容、豆瓣小说、Ajax爬微博、多线程爬淘宝
一.爬取百度贴吧 import re titleR ='<a rel="noreferrer" href=".*?" title=".*?&qu ...
python爬百度贴吧_Python爬虫实战之爬取百度贴吧帖子
大家好,上次我们实验了爬取了糗事百科的段子,那么这次我们来尝试一下爬取百度贴吧的帖子.与上一篇不同的是,这次我们需要用到文件的相关操作. 本篇目标对百度贴吧的任意帖子进行抓取指定是否只抓取楼主发帖内 ...
Python爬虫实战-爬取百度贴吧帖子
本篇目标 1.对百度贴吧的任意帖子进行抓取 2.指定是否只抓取楼主发帖内容 3.将抓取到的内容分析并保存到文件如果觉得一步步看麻烦的话可以拉到最下面有完整源码可以直接使用 1.URL格式的确定首先 ...
python爬虫吧-python爬虫-爬取百度贴吧帖子加图片
1.[代码][Python]代码 # -*- coding: utf-8 -*- """ 百度贴吧帖子抓取 """ import urlli ...

爬取百度贴吧帖子页内容

爬取百度贴吧帖子页内容相关推荐

最新文章

热门文章