清水河畔论坛二手帖子爬虫

  1 # -*- coding:utf-8 -*-
  2 '''
  3 清水河畔二手帖子+爬取二手交易帖子
  4 '''
  5 import requests
  6 import json
  7 from bs4 import BeautifulSoup
  8 import sys
  9 import urllib
 10 import re
 11 from urllib import request,parse
 12 import pymongo
 13 #by 元帅 uestc 2018.2.28
 14 class QSHSpider(object):
 15     # 模拟登陆清水河畔
 16     def __init__(self):
 17         self.headers = {
 18         'username':'',
 19         'password':'',
 20         'Cache - Control': '',
 21         'Connection': 'keep - alive',
 22         'Cookie':'',
 23         'Host':'bbs.uestc.edu.cn',
 24         'Referer':'http: // bbs.uestc.edu.cn / member.php?mod = logging & action = login',
 25         'Upgrade-Insecure - Requests': '1',
 26         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
 27         }
 28
 29     # 模拟登陆
 30     def login(self):
 31         request_url = "http://bbs.uestc.edu.cn/member.php?mod=logging&action=login"
 32         request_new = requests.get(request_url, headers=self.headers)
 33         self.login_request = request_new.text
 34
 35     def get_shopurl(self):
 36         bf = BeautifulSoup(self.login_request,'html.parser')
 37         #print(bf)
 38         shop_a = bf.find_all('div',id = 'hd')[0].find_all('ul',id = 'mn_F201_menu',class_='p_pop h_pop')[0].find_all('li')[0].find_all('a')[0]
 39         self.shopurl = shop_a['href']
 40         print('登陆成功！')
 41         print('\n')
 42         print('您已进入二手帖子专题：' + self.shopurl)
 43
 44     def get_tieziurls(self):
 45         #request_tiezi = requests.get(self.shopurl,headers = self.headers)
 46         req = request.Request(url=self.shopurl,headers=self.headers, method="POST")
 47         response = request.urlopen(req)
 48         content = response.read()
 49         res = r"<a.*?href=.*?<\/a>"
 50         urls = re.findall(res, content.decode('utf-8'))
 51         print('备选主题有：书籍资料；生活用品；交通工具；卡券虚拟；数码硬件；'
 52               '拼单；物品租借；其他；版务/投诉；已解决；')
 53         searcher = input("请输入需要查找的主题 ")
 54         #获取a标签内内容
 55         # res = r'<a .*?>(.*?)</a>'
 56         # texts = re.findall(res, content.decode('utf-8'))
 57         # for t in texts:
 58         #     print(t)
 59         #获取a标签内超链接
 60         #urls = re.findall(r'<a.*?href=.*?>\r\n(.+?)<span class="xg1 num">(.*?)</span><\/a>',re.S)
 61         #bff = BeautifulSoup(request_tiezi.text,'html.parser')
 62         #print(bff)
 63         #shop_tc1 = bff.find_all('div',id = 'wp',class_ = 'wp')[0]
 64         #.find_all('div',class_ = 'boardnav')[0].find_all('div',id = 'ct',class_ = 'wp cl')[0].find_all('div',class_ = 'mn')
 65         #shop_tc1 = bff.select('.wp')
 66         #print(shop_tc1)
 67         #.find_all('ul',id = 'thread_types',class_ = 'ttp bm cl cttp',style = 'height: auto;')[0]
 68         #.find_all('li').find_all('a')[0]
 69         # shop_tiezi_url = shop_tc1['href']
 70         # print(shop_tiezi_url)
 71         for u in urls:
 72             if searcher in str(u):
 73                 ui = u.replace('amp;','')
 74                 #print(ui)
 75                 urls1 = re.findall(r'<a .*?>(.*?)</a>' , ui, re.I | re.S | re.M)
 76                 urls2 = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" ,ui,re.S |re.I |re.M)
 77                 print(urls1[0][0:8] + '   ' + urls2[0])
 78                 self.tieziurls = urls2[0]
 79                 tags = re.findall(r"\d+\.?\d*",urls2[0])
 80                 self.pagetag = tags[1]
 81                 #self.pagetag = re.sub("\D", "", urls2[0])
 82                 #print(self.pagetag)
 83                 break
 84     def get_tiezi(self):
 85         #req_tiezi = request.Request(url=self.tieziurls,headers=self.headers, method="POST")
 86         req_tiezi = request.Request(url=self.new_url, headers=self.headers, method="POST")
 87         response_tiezi = request.urlopen(req_tiezi)
 88         content_tiezi = response_tiezi.read()
 89         res_tiezi = r"<a.*?href=.*?<\/a>"
 90         urls_tiezi = re.findall(res_tiezi, content_tiezi.decode('utf-8'))
 91         for tiezi in urls_tiezi:
 92             if 'class="s xst"' in tiezi:
 93                 tiezi = tiezi.replace('amp;', '')
 94                 tiezi_title = re.findall(r'<a .*?>(.*?)</a>', tiezi, re.I | re.S | re.M)
 95                 tiezi_urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" ,tiezi,re.S |re.I |re.M)
 96                 print('\n\n')
 97                 print('帖子主题:' +tiezi_title[0] + '  ' + '帖子地址:' + tiezi_urls[0])
 98                 self.tiezi_url = tiezi_urls[0]
 99                 infor.get_details()
100                 #self.tiezi_data[] = '帖子地址'
101                 #数据库
102                 tiezi_data = {'帖子' : {'title': tiezi_title[0], 'url' : tiezi_urls[0]}}
103                 client = pymongo.MongoClient('localhost',27017)
104                 mydb = client['mydb']
105                 qingshuihepan = mydb['qingshuihepan']
106                 qingshuihepan.insert_one(tiezi_data)
107     def get_pages(self):
108         urls_based = 'http://bbs.uestc.edu.cn/forum.php?mod=forumdisplay&fid=61&typeid={}&filter=typeid&typeid={}&page={}'
109         for i in range(1,3):
110             self.new_url = urls_based.format(self.pagetag,self.pagetag,i)
111             print('\n\n')
112             print('第'+ str(i) + '页' + '   ' '本页网址：' + self.new_url)
113             infor.get_tiezi()
114     def get_details(self):
115         #print(self.tiezi_url)
116         print('本帖详细内容：')
117         req_detail = request.Request(url=self.tiezi_url, headers=self.headers, method="POST")
118         response_detail = request.urlopen(req_detail)
119         content_detail = response_detail.read()
120         #print(content_detail.decode('utf-8'))
121         bs = BeautifulSoup(content_detail,'html.parser')
122         print(bs.find_all(class_='t_f')[0].text.strip())
123         #urls_detail = re.findall(r'<td class="t_f".*?>(.*?)</td>' , content_detail.decode('utf-8'))
124         # for ud in urls_detail:
125         #     print(ud)
126
if __name__ == "__main__":
    # Script entry point: run the crawl steps in order.
    # The instance stays bound to the module-level name ``infor`` because
    # other code in this file may refer to it by that name.
    infor = QSHSpider()
    # Announce the login before the request is made, not after it returns.
    print('登陆清水河畔ing……')
    infor.login()
    infor.get_shopurl()
    infor.get_tieziurls()
    infor.get_pages()

转载于:https://www.cnblogs.com/zysps1/p/qingshuihepan_ershoutiezi.html

清水河畔论坛二手帖子爬虫相关推荐

UESTC论坛-清水河畔自动登陆/重复发贴/安全性分析
注:以下方法只针对电子科技大学-BBS(清水河畔) 针对问题:如何用代码实现在论坛上自动登陆/反复发贴/抢楼? 一. 登陆到发贴的流程图二. 对论坛的分析 1. 登陆页http://bbs ...
【Python】爬取理想论坛单帖爬虫
代码: # 单帖爬虫,用于爬取理想论坛帖子得到发帖人,发帖时间和回帖时间,url例子见main函数 from bs4 import BeautifulSoup import requests impo ...
动易网站首页调用动网论坛最新帖子列表的操作方法
动易网站首页调用动网论坛最新帖子列表的操作方法以下采用的是:动易网站为swCMS6.5版和动网论坛为dvbbs 8.1.1 版 1. 确定动易swCMS6.5网站首页"论坛新帖& ...
一个n人搜索的论坛精华帖子→网络（转）
<script type="text/javascript"> function fastreply(subject) { if($('postform')) { $( ...
【Nodejs】理想论坛帖子爬虫1.02
在1.01版本中,我发现各回调函数找到数据后再插入数据库有个竞争问题不好解决,如果等所有回调都完成也没有好的处理方法,因为启动不止一处启动了新的TopicSpider实例. 于是我决定把读数据和写DB ...
python爬虫爬取虎扑湖人论坛专区帖子数据，并存入MongoDB数据库中
今天就带大家从头到尾一步一步带着大家爬取虎扑论坛帖子的数据,里面涉及到的一些知识,我会给出学习的连接,大家可以自行去学习查看. 前期准备首先我们打开虎扑NBA论坛,我选择的是湖人专区(小湖迷一个). ...
爬虫xx网站论坛的帖子源码分享
import re import time from urllib import parse import urllib import requests def updatepostinfo(star ...
爬虫监控360论坛有帖子自动发邮件
代码粗糙,凑合看,24小时运行没问题,写在这备忘啰嗦扯蛋版本 #*-coding:utf-8-*- import urllib2 import re import smtplib from emai ...
python爬取论坛帖子_python爬虫爬取虎扑论坛的帖子名称和链接，为什么只能爬10页就报...
该楼层疑似违规已被系统折叠隐藏此楼查看此楼报错信息: UnboundLocalError: local variable 'text_list' referenced before assignm ...

清水河畔论坛二手帖子爬虫

清水河畔论坛二手帖子爬虫相关推荐

最新文章

热门文章