Python爬取最新反爬虫汽车之家口碑

本人刚学Python没几天,代码可能比较丑陋, 大牛不要喷

本文使用的是 Python 2.7.2，因为 PyV8 最高只支持到 2.7.2；js 混淆部分是用 PyV8 直接运行 js 代码来处理的。

原理我之前已经写过一篇文章，这里不再赘述，可以参考那篇文章。

目录结构如下:

fonts文件夹负责存放下载的字体文件

decode_fontfile负责解析字体文件

decode_script负责解析js混淆

document负责模拟js中的document对象,因为PyV8中没有document对象,但是js混淆中用到了

spider是主要逻辑

下面贴一下代码:

spider.py

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from decode_script import DecodeScript
from hero.proxy import proxy
from decode_fontfile import DecodeFontFile
import sys
reload(sys)
# Python 2 only: relies on the reload(sys) on the previous line to re-expose
# setdefaultencoding, so implicit str/unicode conversions use UTF-8.
sys.setdefaultencoding('utf8')


class ParseHtml(object):
    """Fetch an Autohome review page and extract the parts needed to decode it."""

    def __init__(self):
        # NOTE(review): these headers are stored but not sent by get_html_doc.
        # They advertise "br" encoding, so confirm the installed requests can
        # decode brotli before wiring them into the request.
        self.header = {
            "Host": "k.autohome.com.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def get_html_doc(self, url):
        """Download the review page at *url*.

        Returns the raw response body on HTTP 200, or the integer sentinel
        ``1`` for any other status code.
        """
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            # NOTE(review): verify=False disables TLS certificate checks.
            resp = session.get(url, verify=False)
            if resp.status_code != 200:
                return 1
            return resp.content

    def get_text_con(self, html_doc):
        """Return the ``text-con`` element of the last ``mouth-item`` node.

        Raises IndexError when the page contains no ``mouth-item`` element.
        """
        soup = BeautifulSoup(html_doc, 'lxml')
        mouth_item = soup.find_all(class_='mouth-item')[-1]
        return mouth_item.find(class_="text-con")

    def get_font_url(self, html_doc):
        """Return the first ``.ttf`` font URL (without scheme) found by regex.

        Raises IndexError when no font URL is present in *html_doc*.
        """
        regex = r'\w+\.\w+\..*?ttf'
        return re.findall(regex, html_doc)[0]


def run(max_retries=5):
    """Fetch one review page, decode its obfuscated text and print it.

    The page download is attempted up to *max_retries* times; a RuntimeError
    is raised if every attempt returns a non-200 status.
    """
    url = "https://k.autohome.com.cn/detail/view_01c16ytpa964w38c1s70v00000.html?st=2&piap=0|2123|0|0|1|0|0|0|0|0|1#pvareaid=2112108"
    parse = ParseHtml()

    # Fetch the page source; get_html_doc returns 1 when the status is not 200.
    # A bounded loop replaces the original unbounded recursive retry.
    for _ in range(max_retries):
        html_doc = parse.get_html_doc(url)
        if html_doc != 1:
            break
    else:
        raise RuntimeError("failed to fetch %s after %d attempts" % (url, max_retries))

    # Locate the font file link and download the font.
    font_url = parse.get_font_url(html_doc)
    decode_fontfile = DecodeFontFile()
    decode_fontfile.download_fontfile(font_url)

    # Decode the obfuscated review body, one titled section at a time.
    text_con = parse.get_text_con(html_doc)
    decode_script = DecodeScript()
    list_text = decode_script.get_text_con(text_con, decode_fontfile)
    for text in list_text:
        for key, value in text.items():
            print(key+":"+value)


run()

decode_script.py

# -*- coding:utf-8 -*-
"""对混淆的js代码破解,获取想要的内容"""
from bs4 import BeautifulSoup
import re
import PyV8
from document import Global
from decode_fontfile import DecodeFontFile
import sys

# Python 2 only: reload(sys) re-exposes setdefaultencoding so that implicit
# str/unicode conversions in this module use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')


class DecodeScript(object):
    """Turn the obfuscated review markup into plain text, section by section."""

    def get_list_part(self, text_con):
        """Split the review markup on '【' and return the pieces after the first."""
        return str(text_con).split('【')[1:]

    def get_list_title_con_js(self, part_con):
        """Return ``[title, encoded_text, script_tag]`` for one section.

        *title* is the text before '】', *encoded_text* is the decoded span
        between the ``<!--@athm_BASE64@-->`` and ``<!--@athm_js@-->`` markers,
        and *script_tag* is the first ``<script>`` element found in the section.
        Raises AttributeError when either marker is missing.
        """
        # Section heading.
        title = part_con.split("】")[0]
        # Obfuscated text lies between the two marker comments.
        start = re.search('<!--@athm_BASE64@-->', part_con).end()
        end = re.search('<!--@athm_js@-->', part_con).start()
        part_base64 = part_con[start: end].decode("utf-8")
        # Obfuscated JavaScript.
        soup_part = BeautifulSoup(part_con, "lxml")
        h_js = soup_part.find('script')
        return [title, part_base64, h_js]

    def put_js(self, js):
        """Rewrite the obfuscated script so that evaluating it yields two values.

        The rewritten source assigns the script's result to ``result`` and
        makes the script return an array holding the first two variables
        assigned in the matched function.

        Raises ValueError when the expected three-assignment function is not
        found in the script.
        """
        # Drop the 8-character "<script>" prefix and 9-character "</script>" suffix.
        js = str(js)[8:-9]
        # Capture the script's value in a variable.
        js = "var result = " + js
        # Declare the collecting array right after the first opening brace.
        first_point = js.index("{")
        js = js[:first_point+1] + "var arr = [];" + js[first_point+1:]
        # Fill the array right after the function's third assignment.
        regex = r"function\s*\w+\(\)\s*\{\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);\s*(\w+)\s*=[\s\S]*?\);"
        match = re.search(regex, js)
        if match is None:
            raise ValueError("obfuscated script does not match the expected layout")
        second_point = match.end()
        set_arr = "arr = ["+str(match.group(1))+", "+str(match.group(2))+"];"
        js = js[:second_point] + set_arr + js[second_point:]
        # Insert the return statement 13 characters before the end.
        # NOTE(review): the fixed offset is tied to the exact tail of the
        # obfuscator's output - confirm it still holds if the site changes.
        js = js.strip()
        js = js[:-13] + "return arr;" + js[-13:]
        return js

    def run_js(self, js):
        """Evaluate *js* in V8 and return ``[first_array_items, second_array_items]``."""
        glob = Global()
        with PyV8.JSContext(glob) as ctext:
            ctext.eval(js)
            js_array = ctext.locals.result
            # Copy into Python lists while the context is still entered.
            list_num16 = [num16 for num16 in js_array[0]]
            list_index = [index for index in js_array[1]]
        return [list_num16, list_index]

    def replace_span(self, part_con, decode_fontfile):
        """Replace every ``hs_kw`` span in a section with its decoded character.

        Returns a one-entry dict mapping the section title to the decoded text.
        """
        title, con, script_tag = self.get_list_title_con_js(part_con)
        # Reassemble the script, then run it to get the hex codes and index map.
        js = self.put_js(script_tag)
        list_num16, list_index = self.run_js(js)
        # The first element is a comma-separated string of hex code points.
        list_num16 = list_num16[0].split(",")
        regex = r"<span\s*class[\s\S]*?hs_kw(\d+)[\s\S]*?</span>"
        # finditer scans the original text; `con` is rebound inside the loop.
        for span in re.finditer(regex, con):
            # Python 2 conversion kept as-is: it relies on the utf8 default encoding.
            tag_span = span.group().encode('unicode_escape').decode('string_escape')
            index = list_index[int(span.group(1))]
            num16 = list_num16[int(index)]
            glyph = "uni"+num16.upper()
            font = decode_fontfile.get_font(glyph)
            con = con.replace(tag_span, font)
        return {title: str(con)}

    def get_text_con(self, text_con, decode_fontfile):
        """Decode the whole review and return a list of ``{title: text}`` dicts."""
        return [
            self.replace_span(part_con, decode_fontfile)
            for part_con in self.get_list_part(text_con)
        ]

decode_fontfile.py

# -*- coding:utf-8 -*-
"""解析字体文件"""
from fontTools.ttLib import TTFont
import requests
import re
import os

# Characters indexed by glyph ID.
# NOTE(review): assumes every downloaded font keeps this glyph order - confirm
# against a fresh font file if the decoded text looks wrong.
list_font = [
    ' ', '一', '七', '三', '上', '下', '不', '中', '档', '比', '油', '泥', '灯', '九', '了', '二', '五',
    '低', '保', '光', '八', '公', '六', '养', '内', '冷', '副', '加', '动', '十', '电', '的', '皮', '盘', '真', '着', '路', '身',
    '软', '过', '近', '远', '里', '量', '长', '门', '问', '只', '右', '启', '呢', '味', '和', '响', '四', '地', '坏', '坐', '外',
    '多', '大', '好', '孩', '实', '小', '少', '短', '矮', '硬', '空', '级', '耗', '雨', '音', '高', '左', '开', '当', '很', '得',
    '性', '自', '手', '排', '控', '无', '是', '更', '有', '机', '来',
]


class DecodeFontFile(object):
    """Download the page's custom font and map glyph names to characters."""

    # Parsed font cache; class-level defaults keep bare instances usable.
    _font = None
    _font_path = None

    def __init__(self):
        self.file_path = ""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}

    def download_fontfile(self, font_url):
        """Download the font at *font_url* (given without scheme) into ./fonts/.

        Sets ``self.file_path`` to the saved file. Raises IndexError when no
        file name can be derived from the URL, and requests' HTTPError when
        the server answers with an error status.
        """
        font_url = "http://"+font_url
        resp = requests.get(font_url, headers=self.headers)
        # Fail here rather than saving an error page as if it were a font.
        resp.raise_for_status()
        file_name = re.findall(r'\w{20,}[\s\S]*?ttf', font_url)[0]
        self.file_path = "./fonts/"+file_name
        # Create the target directory if it is missing.
        font_dir = os.path.dirname(self.file_path)
        if not os.path.isdir(font_dir):
            os.makedirs(font_dir)
        with open(self.file_path, "wb") as f:
            f.write(resp.content)
        # A new file was written; drop any previously parsed font.
        self._font = None
        self._font_path = None

    def _load_font(self):
        """Return the parsed font for ``self.file_path``, parsing it at most once."""
        if self._font is None or self._font_path != self.file_path:
            self._font = TTFont(self.file_path)
            self._font_path = self.file_path
        return self._font

    def get_glyph_id(self, glyph):
        """Return the glyph ID of the glyph named *glyph* in the downloaded font."""
        return self._load_font().getGlyphID(glyph)

    def get_font(self, glyph):
        """Return the character from ``list_font`` at the glyph ID of *glyph*."""
        glyph_id = self.get_glyph_id(glyph)
        return list_font[glyph_id]

document.py

# -*- coding:utf-8 -*-
"""模拟Document对象和window对象"""
import PyV8


class Element():
    """Stand-in for a DOM element; only carries a ``sheet`` attribute."""

    def __init__(self):
        self.sheet = ""


class Head(object):
    """Stand-in for the element returned by ``getElementsByTagName``."""

    def appendChild(self, *args, **kwargs):
        """Ignore the arguments and return the string "sheet"."""
        return "sheet"


class v8Doc(PyV8.JSClass):
    """Minimal object exposed to the script as ``document`` and ``window``."""

    def createElement(self, *args, **kwargs):
        """Return a fresh Element regardless of the arguments."""
        return Element()

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-item list holding a new Head."""
        return [Head()]

    def getComputedStyle(self, *args, **kwargs):
        """Always return None."""
        return None

    def decodeURIComponent(self, *args, **kwargs):
        """Return the positional arguments tuple unchanged."""
        return args

    def querySelectorAll(self, *args, **kwargs):
        """Always return None."""
        return None


class Global(PyV8.JSClass):
    """Global scope for the V8 context, providing ``document`` and ``window``."""

    def __init__(self):
        # Two separate v8Doc instances, one per global name.
        self.document = v8Doc()
        self.window = v8Doc()

输出结果

Python爬取最新反爬虫汽车之家口碑相关推荐

用Python爬取最新股票数据含完整源代码
用Python爬取最新股票数据含完整源代码抓取目标: url:http://webapi.cninfo.com.cn/#/marketDataDate 数据目标: 获取证券代码证券简称交易日期 ...
python爬取喜马拉雅_Python爬虫实战案例之爬取喜马拉雅音频数据详解
这篇文章我们来讲一下在网站建设中,Python爬虫实战案例之爬取喜马拉雅音频数据详解.本文对大家进行网站开发设计工作或者学习都有一定帮助,下面让我们进入正文. 前言喜马拉雅是专业的音频分享平台,汇集 ...
python爬取晋江_爬虫爬取晋江文学网总分榜（失败）
一.目的 : 爬取晋江文学网总分榜二.python爬取数据三.爬取在开始多出现了38号而且顺序内容不准确代码: import requests from bs4 import Beautifu ...
python爬取晋江_爬虫爬取晋江文学网总分榜
一.目的 : 爬取晋江文学网总分榜二.python爬取数据三.爬取在开始多出现了38号而且顺序内容不准确代码: import requests from bs4 import Beautifu ...
python爬取最新说章节_练习_Python3 爬取笔趣阁最新小说章节
警告:本文代码仅供学习,禁止违法使用或商用. 这里拿人气小说<黎明之剑>来举个栗子,喜欢小说<黎明之剑>的朋友们请支持正版阅读. 笔趣阁网站上的其他书籍基本上的都可以套用,其他 ...
python爬取方式_Python 爬虫入门（三）—— 寻找合适的爬取策略
写爬虫之前,首先要明确爬取的数据.然后,思考从哪些地方可以获取这些数据.下面以一个实际案例来说明,怎么寻找一个好的爬虫策略.(代码仅供学习交流,切勿用作商业或其他有害行为) 1).方式一:直接爬取网站 ...
python爬取网页内容_Python爬虫原理解析
笔者公众号:技术杂学铺笔者网站:mwhitelab.com 本文将从何为爬虫.网页结构.python代码实现等方面逐步解析网络爬虫. 1. 何为爬虫如今互联网上存储着大量的信息. 作为普通网民,我 ...
python爬取拉勾网_python爬虫—爬取拉钩网
本人自学python,小试牛刀,爬取广州片区拉钩网招聘信息.仅用于学习参考文章:https://blog.csdn.net/SvJr6gGCzUJ96OyUo/article/details/805 ...
python爬取网页内容post_python爬虫之使用POST抓取网页内容
首先先向小伙伴介绍一下HTTP中GET和POST 教小伙伴们写爬虫,通过POST获取网页内容使用POST的原因: GET是不安全的,因为在传输过程,数据被放在请求的URL中,而如今现有的很多服务器. ...
python 爬取直播_python爬虫，轻快爬取直播平台热度排行
''' 需求分析 1.爬取页面 2.获取单个房间的数据 3.从上面数据中提取主播昵称和查看次数 4.格式化数据 5.按查看次数排序数据 6.添加序号 7.打印输出 ''' ''' 理解面向对象的封装思 ...

Python爬取最新反爬虫汽车之家口碑

Python爬取最新反爬虫汽车之家口碑相关推荐

最新文章

热门文章