某投诉网站爬虫-获取对应的投诉信息

此爬虫原本是一道面试题，说难不难，说简单也不简单。站点的反爬机制我现在也没有完全摸清楚，不过目前看来，应该是在一段时间内访问次数超过一定数量就会封 IP 5-10 分钟，解封之后就可以正常访问了。

爬虫源代码

# -*- coding:utf-8  -*-
# @Time     : 2020-12-03 13:20
# @Author   : BGLB
# @Email    : bglb@qq.com
# @Software : PyCharm
import csv
import hashlib
import json
import os
import random
import threading
import time
from functools import wraps

from requests import get

# Assignment brief (this crawler started out as an interview exercise):
# 1. In any language, write a crawler that fetches the latest complaints from
#    tousu.sina.com.cn and writes them to a text file or an Excel file.
# 2. Output fields: complaint content, complaint target, complaint demand,
#    complaint time.
# 3. Time allowed: 48 hours.
# 4. Deliverables: (a) commented source code, (b) a sample output file.


def time_logging(func):
    """Decorator that prints how long each call to *func* took.

    The elapsed wall-clock time is printed in seconds, minutes or hours
    (two decimals), together with the name of the wrapped function.

    :param func: the callable to time
    :return: a wrapper that forwards every argument and returns func's result
    """
    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def wrapper(*args, **kw):
        start_time = time.time()
        func_result = func(*args, **kw)
        runtime = time.time() - start_time
        if runtime < 60:
            runtime = "{:.2f}s".format(runtime)
        elif runtime < 3600:
            runtime = "{:.2f}m".format(runtime / 60)
        else:
            runtime = "{:.2f}h".format(runtime / 3600)
        content = '[{0:^15}] - 运行时间 - [{1:^6}]'.format(func.__name__, runtime)
        print("{}".format(content))
        return func_result
    return wrapper


class SinaTousu(object):
    """Crawler for the complaint feed API of tousu.sina.com.cn.

    Every page is fetched from the ``/api/index/feed`` endpoint with a
    SHA-256 signed query, then written to ``jsondata_<page>.json`` (raw data)
    and ``result_<page>.csv`` (flattened rows) in the current directory.
    ``run`` finally merges the per-page files into ``jsondata.json`` and
    ``result.csv``.
    """

    _MAX_PAGE_SIZE = 30        # largest page size used by this crawler
    _REQUEST_TIMEOUT = 10      # seconds; without it a stalled request hangs forever
    _SIGN_KEY = "$d6eb7ff91ee257475%"
    _NONCE_LENGTH = 16
    _NONCE_CHARS = ("0123456789"
                    "abcdefghijklmnopqrstuvwxyz"
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    # Feed type name -> numeric "type" query parameter.
    __data_type_dict = {
        "最热投诉": 1,
        "最新投诉": 2,
        "已回复": 3,
        "已完成": 4,
    }
    # Site name -> base URL.
    __hosts_type_dict = {
        "湖北投诉": "https://hb.tousu.sina.com.cn",
        "全国投诉": "https://tousu.sina.com.cn"
    }

    def __init__(self, host_str="全国投诉", type_str="最热投诉", count=100):
        """
        :param host_str: site to crawl, a key of the host table
        :param type_str: feed type, a key of the type table
        :param count: number of complaints wanted
        :raises KeyError: if host_str is not a known site
        """
        self.host_str = host_str
        self.type_str = type_str
        self.__page_size = self._MAX_PAGE_SIZE
        self.__pages = 1
        self.__max_thread_count = 10
        self.__current_thread_count = 0
        # Guards self.__pages, which worker threads may lower concurrently.
        self.__state_lock = threading.Lock()
        self.__url = self.__create_url()
        self.count = count
        if self.count > self.__page_size:
            # Ceiling division: an exact multiple must not request an extra page.
            self.__pages = self._ceil_div(self.count, self.__page_size)
        else:
            self.__page_size = self.count

    @staticmethod
    def _ceil_div(total, size):
        """Return total / size rounded up to the next integer."""
        return int(-(-total // size))

    # Endpoints of the site; only the complaint feed is crawled, so the other
    # entries are kept for reference only.
    def __create_url(self):
        """Build the URL of the complaint feed endpoint for the chosen site."""
        base = self.__hosts_type_dict[self.host_str]
        api_dict = {
            "LAWS_FEED": base + "/api/laws/feed",
            # Complaint feed endpoint, found by reading the site's JavaScript.
            "INDEX_FEED": base + "/api/index/feed",
            "index_article": base + "/api/articles/notice",
        }
        return api_dict["INDEX_FEED"]

    def forged_param(self, page):
        """Build the signed query parameters for one page.

        The signature is the SHA-256 hex digest of the six values
        (timestamp, nonce, key, type, page size, page) converted to strings,
        sorted ascending and concatenated.

        :param page: page number to fetch
        :return: dict with ts, type, page_size, page, rs and signature
        :raises KeyError: if type_str is not a known feed type
        """
        ts = int(time.time() * 1000)
        # Random nonce drawn from the full 0-9a-zA-Z alphabet.
        rs = "".join(random.choice(self._NONCE_CHARS)
                     for _ in range(self._NONCE_LENGTH))
        data_type = self.__data_type_dict[self.type_str]
        parts = [str(ts), rs, self._SIGN_KEY, str(data_type),
                 str(self.__page_size), str(page)]
        signature = hashlib.sha256("".join(sorted(parts)).encode("utf8")).hexdigest()
        return {
            "ts": ts,
            "type": data_type,
            "page_size": self.__page_size,
            "page": page,
            "rs": rs,
            "signature": signature
        }

    @staticmethod
    def parse_json(jsondata):
        """Flatten the ``data`` object of an API response into CSV rows.

        :param jsondata: the ``data`` object returned by get_json, or None
        :return: list of row dicts, or None when jsondata is None
        """
        if jsondata is None:
            return
        res_lists = []
        host = "https:"  # URLs in the response have no scheme prefix
        for item in jsondata["lists"]:
            main = item["main"]
            author = item["author"]
            res_lists.append({
                "投诉编号": main["sn"],
                "投诉对象": main["cotitle"],
                "投诉标题": main["title"],
                "投诉时间": time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(int(main["timestamp"]))),
                "投诉详情": main["summary"],
                "投诉要求": main["appeal"],
                "详情页面": host + main["url"],
                "投诉发起人昵称": author["title"],
                "投诉发起人微博": host + author["wb_profile"],
            })
        return res_lists

    def get_json(self, param_dict):
        """Request one page and return the ``data`` part of the response.

        :param param_dict: query parameters built by forged_param
        :return: the ``data`` object, or None on a network error, a non-200
                 status, an unparsable body or a non-zero API status code
        """
        try:
            res = get(self.__url, param_dict, timeout=self._REQUEST_TIMEOUT)
        except OSError as err:  # requests' exceptions derive from IOError/OSError
            print("请求失败：{}".format(err))
            return None
        print(res.status_code)
        if res.status_code == 456:
            print("您被封啦，请等待5~60分钟自动解封")
        if res.status_code != 200:
            return None
        try:
            # Decode only successful responses; error pages need not be UTF-8 JSON.
            result = json.loads(res.content.decode())["result"]
        except (ValueError, KeyError) as err:
            print("响应解析失败：{}".format(err))
            return None
        if result["status"]["code"] == 0:
            return result["data"]
        print("{}参数错误！检查参数".format(result["status"]))
        return None

    @staticmethod
    def write_file(filename, datas):
        """Write datas to ``./<filename>`` as CSV rows or as JSON.

        CSV files are written without a header row; combine adds the header
        when it merges the per-page files.

        :param filename: target file name; the extension selects the format
        :param datas: list of row dicts (csv) or any JSON-serialisable object
        :return: len(datas)
        """
        ext = filename.split('.')[-1]
        path = "./" + filename
        with open(path, "w", encoding="utf8", newline='') as f:
            if ext == "csv" and datas:  # an empty page has no row to take keys from
                writer = csv.DictWriter(f, fieldnames=list(datas[0].keys()))
                writer.writerows(datas)
            elif ext == "json":
                json.dump(datas, f, ensure_ascii=False)
        print("写入文件-[{}]-[{}]条数据".format(path, len(datas)))
        return len(datas)

    @staticmethod
    def file_walker(path):
        """Collect the per-page files found directly inside *path*.

        Only ``result_<n>.csv`` and ``jsondata_<n>.json`` are matched, so the
        merged ``result.csv`` / ``jsondata.json`` and unrelated files are
        ignored. Sub-directories are not searched because combine opens the
        files relative to the current directory.

        :param path: directory to scan
        :return: {"json": [...], "result": [...]}, file names sorted by page number
        """
        def page_number(name, prefix, ext):
            # Return <n> for "<prefix>_<n>.<ext>", otherwise None.
            stem, _, suffix = name.rpartition(".")
            head, _, number = stem.rpartition("_")
            if suffix != ext or head != prefix or not number.isdecimal():
                return None
            return int(number)

        patterns = {"json": ("jsondata", "json"), "result": ("result", "csv")}
        numbered = {"json": [], "result": []}
        for fn in os.listdir(path):
            if not os.path.isfile(os.path.join(path, fn)):
                continue
            for kind, (prefix, ext) in patterns.items():
                number = page_number(fn, prefix, ext)
                if number is not None:
                    numbered[kind].append((number, fn))
        return {kind: [fn for _, fn in sorted(pairs)]
                for kind, pairs in numbered.items()}

    @staticmethod
    def combine(res_filename, filelist):
        """Merge per-page files into one file and delete the per-page files.

        CSV parts are concatenated below a fixed header row. JSON parts
        become the values of one object keyed ``result_<page>``.

        :param res_filename: merged file name; the extension selects the format
        :param filelist: per-page file names, relative to the current directory
        :return: None
        """
        ext = res_filename.split('.')[-1]
        resfilepath = "./" + res_filename
        if os.path.exists(resfilepath):
            os.remove(resfilepath)
        if len(filelist) == 0:
            return
        with open(res_filename, 'wb+') as fw:
            if ext == "csv":
                fw.write("投诉编号,投诉对象,投诉标题,投诉时间,投诉详情,投诉要求,详情页面,投诉发起人昵称,投诉发起人微博\n".encode('utf8'))
                for file in filelist:
                    file_path = "./" + file
                    # Close the part before deleting it (required on Windows).
                    with open(file_path, 'rb') as part:
                        fw.write(part.read())
                    os.remove(file_path)
            if ext == "json":
                fw.write("{".encode("utf8"))
                last_index = len(filelist) - 1
                for index, file in enumerate(filelist):
                    file_path = "./" + file
                    fw.write('"result_{}":'.format(file.split('.')[0].split('_')[-1]).encode('utf8'))
                    with open(file_path, 'rb') as part:
                        fw.write(part.read())
                    if index != last_index:
                        fw.write(','.encode('utf8'))
                    os.remove(file_path)
                fw.write("}".encode("utf8"))

    def set_pages_pagesize(self, pages, page_size):
        """Set how many pages to fetch and how many items each page holds.

        A page_size above 30 is clamped to 30 and the page count is
        recomputed so that pages * page_size items are still covered.

        :param pages: number of pages wanted
        :param page_size: items per page
        """
        self.count = pages * page_size
        if page_size > self._MAX_PAGE_SIZE:
            self.__page_size = self._MAX_PAGE_SIZE
            self.__pages = self._ceil_div(self.count, self.__page_size)
            print("警告：每页最大数据条数为30条，已为您选取最优选择：\n{}".format(
                {"pages": self.__pages, "page_size": self.__page_size}))
        else:
            self.__pages = pages
            self.__page_size = page_size

    def get_pages_pagesize(self):
        """Return the current paging settings as {"pages": ..., "page_size": ...}."""
        return {"pages": self.__pages, "page_size": self.__page_size}

    def __start(self, page, isMultithreading):
        """Fetch one page and write its raw JSON and CSV files.

        :param page: page number to fetch
        :param isMultithreading: kept for call compatibility; the shared-state
                                 lock is cheap enough to take in both modes
        """
        params = self.forged_param(page)
        data_json = self.get_json(params)
        if data_json is None:
            return
        pager = data_json['pager']
        pages = pager['page_amount']
        # NOTE(review): 'item_count' is assumed to be the pager's total-item
        # field; falls back to the page count when it is absent - confirm
        # against a live response.
        item_count = pager.get('item_count', pages)
        with self.__state_lock:
            if pages < self.__pages:
                self.__pages = pages
        print("----数据总页数-[{}]-数据总条数-[{}]--------".format(pages, item_count))
        result_data = self.parse_json(data_json)
        self.write_file("jsondata_{}.json".format(page), data_json)
        self.write_file("result_{}.csv".format(page), result_data)

    def __cooldown(self):
        """Pause 10 seconds after every batch of more than 40 requests."""
        if self.__current_thread_count > 40:
            self.__current_thread_count = 0
            time.sleep(10)

    def thread_manage(self, f, kwargs):
        """Start f(**kwargs) in a new thread while throttling the request rate.

        :param f: callable to run in the thread
        :param kwargs: keyword arguments passed to f
        :return: the started thread
        """
        t = threading.Thread(target=f, kwargs=kwargs)
        self.__current_thread_count += 1
        time.sleep(0.1)
        t.start()
        # Wait for this worker when too many threads are alive. The -2 / +3
        # offsets are the original author's tuning and are kept unchanged.
        if threading.active_count() - 2 >= self.__max_thread_count + 3:
            t.join()
        self.__cooldown()
        return t

    def rm_file(self):
        """Delete the merged output files left over from a previous run."""
        for file in ["./result.csv", "./jsondata.json"]:
            if os.path.exists(file):
                os.remove(file)

    def run(self, page_list=None, isMultithreading=True):
        """Crawl the requested pages and merge the per-page files.

        :param page_list: page numbers to fetch; defaults to 1..pages
        :param isMultithreading: fetch pages in worker threads when True,
                                 one after another in this thread when False
        :return: None
        """
        self.rm_file()
        print("------------一共爬取{}条数据，选取最优的爬取速度为 [每次抓取量:{}, 抓取次数:{}]-------------".format(
            self.count, self.__page_size, self.__pages))
        if page_list is None:
            page_count_list = list(range(1, self.__pages + 1))
        else:
            page_count_list = list(page_list)  # copy: do not shuffle the caller's list
        random.shuffle(page_count_list)

        workers = []
        for page in page_count_list:
            if isMultithreading:
                workers.append(self.thread_manage(
                    self.__start, kwargs={"page": page, 'isMultithreading': isMultithreading}))
            else:
                # Same pacing as thread_manage, without spawning a thread.
                self.__current_thread_count += 1
                time.sleep(0.1)
                self.__start(page, isMultithreading)
                self.__cooldown()
            print("--------------[{}]-[{}]-当前抓取次数-[{}]------------------".format(
                self.host_str, self.type_str, page))

        # Wait for our own workers instead of polling the global thread count.
        for worker in workers:
            worker.join()

        fs_dict = self.file_walker("./")
        self.combine("result.csv", fs_dict["result"])
        self.combine("jsondata.json", fs_dict["json"])
        print("抓取完毕")


if __name__ == '__main__':
    q = SinaTousu("全国投诉", "最新投诉")
    q.set_pages_pagesize(100, 30)

    @time_logging
    def main():
        q.run()
        # q.run(isMultithreading=False)  # run without worker threads

    main()

# Observed runs, as transcribed from the author's notes
# (elapsed time - number of requests):
#   25s - 180
#   4m  - 325
#   41s - 90
#   41s - 95
#   6m  - 406
#   11s - 40 (banned)

爬虫解析

网站接口的签名（signature）主要通过 SHA-256 哈希生成
主要变量有：时间戳 ts；key="$d6eb7ff91ee257475%"；字符数组 a（[0-9, a-z, A-Z]）；当前访问页码 page；当前访问每页数据量 page_size；访问类型对应的数字，_type = ["最热投诉", "最新投诉", "已回复", "已完成"]（依次对应 1~4）；以及从数组 a 中随机抽取 16 个字符拼成的字符串 rs
先建一个数组 base_sign，放入六个转成字符串的变量：ts、rs、key、_type_index + 1、page_size、page
然后把 base_sign 升序排序并拼接成一个字符串，最后对这个字符串计算 SHA-256 哈希（取十六进制结果），就可以得到 signature 的值
网站接口分为 全国站点 和 湖北站点 目前也只发现这两个站点
每次访问最多可以获取30条数据，网站上每次固定十条数据

代码解析

代码每次运行会删除上次爬的数据
代码加入了多线程，也可以设置关闭；多线程目前还有点小问题，请求过快时容易触发站点的反爬

测试图片

大概是访问了 80多次就封了
有大佬可以请教下这个反爬该怎么避免吗？

某投诉网站爬虫-获取对应的投诉信息相关推荐

python爬虫获取肯德基门店信息
python爬虫获取肯德基门店信息 1.在谷歌浏览器中打开肯德基官网,进入餐厅查询页面 2.在搜索框中输入地区并按f12打开开发者工具发现已经抓取到了一条Ajax请求,可以从中获取请求的url,以及 ...
独家 | 虚假疫苗网站如何获取你的个人信息
作者: Lance Whitney翻译:陈超校对:王可汗本文约1000字,建议阅读3分钟本文揭示了诈骗网站如何利用人们对新冠疫苗信息的关注获取用户的个人信息. 该网站最近被政府查封,该网站仿造一家开发 ...
Java用Jsoup开发爬虫获取双色球开奖信息
想要获取双色球开奖信息,利用爬虫无疑是个比较方便的方式,针对简单的功能,除了python以外,Java也有比较便捷的方式--Jsoup 要获取指定位置的内容,需要知道该内容的标签,比如红球的标签是'l ...
div中内容靠右_python读取excel的公司名称信息，并爬虫获取公司的经营范围信息，回填到excel中...
大家总说,python今天学明天忘,没有实际操作,想要知道哪里有练手的机会.其实你要善于发现问题,捕捉问题.不管是你看到文章中的一些案例,还是微信群友问的一个问题.你都需要仔细思考,如果是你做你应该怎 ...
python读取excel的公司名称信息，并爬虫获取公司的经营范围信息，回填到excel中
本文作者:是老王吖原文链接:https://blog.csdn.net/jdkss/article/details/106077755?utm_source=app 1.项目需求项目需求:这个问题 ...
Python爬虫获取斗鱼主播信息
感谢参考原文-http://bjbsair.com/2020-03-27/tech-info/7150.html 下面我们进入正题首先我们进入斗鱼的官网我发现首页是一些推荐的主播,并不全面,不能 ...
微博数据爬虫——获取用户微博相关信息（四）
任务:给定u_id,获取用户每条微博的mid.发布时间.照片数.@数.链接数 1.获取微博信息查看网页源代码,发现数据保存在js中利用正则匹配可以实现获取单条微博全部信息 add = urllib ...
selenium爬虫获取自己的课表信息
方正教务系统做的还挺安全!!! 密码用rsa加密,还带了token防御一开始,我尝试用 request.post()表单提交数据,没用,看了一下网络请求,表单提交了4个数据,如下图: 也就是说,我们 ...
菜鸟爬虫——获取安居客二手房信息
以安居客二手房为例前言了解爬虫爬虫目录结构爬虫主体代码 items.py 反反爬虫策略运行爬虫前言因为需要一些二手房数据,菜鸟开启了爬虫之路!不过需要注意的是,在爬取数据时,要遵守< ...

某投诉网站爬虫-获取对应的投诉信息

某投诉网站爬虫-获取对应的投诉信息

爬虫源代码

爬虫解析

代码解析

测试图片

某投诉网站爬虫-获取对应的投诉信息相关推荐

最新文章

热门文章