pythonttf字体反爬虫_利用Python采集起点中文网小说，并解决字体反爬的问题

个人比较喜欢看小说，于是想利用Python爬取小说网站——起点中文网。在编写爬虫、定位数据的过程中遇到了字体反爬虫，咨询了旁边的前端大神之后，下面说一下解决方法。

当前页面接口返回的html源码

万字

第一步：获取当前页面的字体文件链接，可以通过正则获取

start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'

# Fetch the HTML of the current page.
response = requests.get(start_url).text

# The original snippet searched an undefined name `fonturl`; take it from the
# page's <style> element, the same selector the complete listing uses.
fonturl = pq(response)('p.update > span > style').text()

# Extract the font file link from the style text with a regular expression.
url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)

第二步:通过fontTools模块获取当前字体映射关系

def get_font(url, timeout=10):
    """Download a font file and return its character map.

    :param url: address of the font file.
    :param timeout: seconds to wait for the server before giving up.
    :returns: the result of ``TTFont.getBestCmap()`` for the downloaded font.
    :raises requests.exceptions.RequestException: on a network failure or an
        HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of handing an error page to the font parser.
    response.raise_for_status()

    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when reading the cmap raises.
        font.close()

第三步：通过当前映射关系可以看到，对应的数字被映射成了英文单词（如 zero、one），因此创建一个dict进行转换

def get_encode(cmap, values):
    """Decode font-obfuscated digits into a plain numeric string.

    :param cmap: mapping of code point (int) to glyph name such as ``'one'``.
    :param values: ``;``-separated HTML numeric character references, e.g.
        ``'&#100234;&#100235'``. A trailing ``;`` and empty input are
        accepted; hexadecimal references (``&#x...``) are supported too.
    :returns: the decoded text, e.g. ``'12.5'``.
    :raises KeyError: if a code point is missing from ``cmap`` or its glyph
        name is neither a digit name nor ``'period'``.
    :raises ValueError: if the numeric part of a reference is not a number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}

    decoded = []
    for value in values.split(';'):
        value = value.strip()
        if not value:
            # Empty input or a trailing ';' yields an empty token, which
            # previously crashed in int('').
            continue
        # Drop the leading '&#' of the character reference.
        number = value[2:]
        if number[:1] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        decoded.append(WORD_MAP[cmap[codepoint]])
    return ''.join(decoded)

第四步：然后就是通过pyquery进行数据提取

def get_index(start_url):
    """Extract the book entries of one listing page.

    Downloads ``start_url``, decodes the obfuscated word counts with the
    page's font, and builds one dict per book. This step only demonstrates
    the extraction; the dicts are not stored or returned here.

    :param start_url: URL of a single listing page.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Class attribute of the span that holds the obfuscated number.
    classattr = doc('p.update > span > span').attr('class')

    # NOTE(review): the published pattern was '(.*?)' % classattr, which has
    # no %s placeholder and raises TypeError; the surrounding markup was
    # presumably stripped when the article was rendered. This reconstruction
    # captures the body of every <span> carrying that class -- confirm
    # against the live HTML.
    pattern = r'<span[^>]*class="%s"[^>]*>(.*?)</span>' % re.escape(classattr)

    # Every obfuscated word-count string on the page, in document order.
    numberlist = re.findall(pattern, response)

    # Text of the <style> element that declares the font.
    fonturl = doc('p.update > span > style').text()

    # Extract the font file link from the style text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)
    cmap = get_font(url)

    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # [:-1] drops the last character of the captured text before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])

第五步：将数据存入mongodb

# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']

def mongo(item):
    """Store one scraped book in the module-level collection ``p``.

    :param item: dict describing a single book.
    """
    # Collection.insert() is deprecated and was removed in PyMongo 4;
    # insert_one() is the supported way to add a single document.
    p.insert_one(item)

附当前Python爬虫文件源码

#coding=utf-8

import requests,json,time,re

from requests.exceptions import RequestException

from pyquery import PyQuery as pq

from fontTools.ttLib import TTFont

from io import BytesIO

import pymongo

# Listing URL without the page number; callers append it after 'page='.
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page='

# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']

def get_font(url, timeout=10):
    """Download a font file and return its character map.

    :param url: address of the font file.
    :param timeout: seconds to wait for the server before giving up.
    :returns: the result of ``TTFont.getBestCmap()`` for the downloaded font.
    :raises requests.exceptions.RequestException: on a network failure or an
        HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of handing an error page to the font parser.
    response.raise_for_status()

    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when reading the cmap raises.
        font.close()

def get_encode(cmap, values):
    """Decode font-obfuscated digits into a plain numeric string.

    :param cmap: mapping of code point (int) to glyph name such as ``'one'``.
    :param values: ``;``-separated HTML numeric character references, e.g.
        ``'&#100234;&#100235'``. A trailing ``;`` and empty input are
        accepted; hexadecimal references (``&#x...``) are supported too.
    :returns: the decoded text, e.g. ``'12.5'``.
    :raises KeyError: if a code point is missing from ``cmap`` or its glyph
        name is neither a digit name nor ``'period'``.
    :raises ValueError: if the numeric part of a reference is not a number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}

    decoded = []
    for value in values.split(';'):
        value = value.strip()
        if not value:
            # Empty input or a trailing ';' yields an empty token, which
            # previously crashed in int('').
            continue
        # Drop the leading '&#' of the character reference.
        number = value[2:]
        if number[:1] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        decoded.append(WORD_MAP[cmap[codepoint]])
    return ''.join(decoded)

def get_index(start_url):
    """Scrape one listing page and store every book found on it.

    Downloads ``start_url``, decodes the obfuscated word counts with the
    page's font, and passes one dict per book to ``mongo``.

    :param start_url: URL of a single listing page.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Class attribute of the span that holds the obfuscated number.
    classattr = doc('p.update > span > span').attr('class')

    # NOTE(review): the published pattern was '(.*?)' % classattr, which has
    # no %s placeholder and raises TypeError; the surrounding markup was
    # presumably stripped when the article was rendered. This reconstruction
    # captures the body of every <span> carrying that class -- confirm
    # against the live HTML.
    pattern = r'<span[^>]*class="%s"[^>]*>(.*?)</span>' % re.escape(classattr)

    # Every obfuscated word-count string on the page, in document order.
    numberlist = re.findall(pattern, response)

    # Text of the <style> element that declares the font.
    fonturl = doc('p.update > span > style').text()

    # Extract the font file link from the style text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)
    cmap = get_font(url)

    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # [:-1] drops the last character of the captured text before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])
        mongo(item)

def mongo(item):
    """Store one scraped book in the module-level collection ``p``.

    :param item: dict describing a single book.
    """
    # Collection.insert() is deprecated and was removed in PyMongo 4;
    # insert_one() is the supported way to add a single document.
    p.insert_one(item)

def main():
    """Crawl listing pages 1 through 999, one ``get_index`` call per page."""
    page = 1
    while page < 1000:
        # start_url ends in 'page=', so appending the number completes it.
        get_index('{}{}'.format(start_url, page))
        page += 1


# Run the crawl only when executed as a script.
if __name__ == '__main__':
    main()

针对月票榜月票数字体反爬修改

def get_index(start_url):
    """Scrape one monthly-ticket ranking page and store every book on it.

    Downloads ``start_url``, decodes the obfuscated ticket counts with the
    page's font, and passes one dict per book to ``mongo``.

    :param start_url: URL of a single ranking page.
    """
    # The file never imports urllib3 at module level, so import it here;
    # without this the next line raises NameError.
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Text of the <style> element that declares the font.
    fonturl = doc('div.total > p > span > style').text()

    # The font-family name is also the file name of the font.
    addr = re.search('font-family: (.+?); src', fonturl).group(1)
    url = 'https://qidian.gtimg.com/qd_anti_spider/{addr}.woff'.format(addr=addr)
    cmap = get_font(url)
    print(cmap)

    # NOTE(review): the published pattern was '(.*?)' % addr, which has no %s
    # placeholder and raises TypeError; the surrounding markup was presumably
    # stripped when the article was rendered. This reconstruction assumes the
    # number's <span> uses the font name as its class -- confirm against the
    # live HTML.
    pattern = r'<span[^>]*class="%s"[^>]*>(.*?)</span>' % re.escape(addr)

    # Every obfuscated number string on the page, in document order.
    numberlist = re.findall(pattern, response)
    print('numberlist: ', numberlist)

    books = doc('.book-img-text li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # [:-1] drops the last character of the captured text before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])
        item['font_url'] = url
        mongo(item)

pythonttf字体反爬虫_利用Python采集起点中文网小说，并解决字体反爬的问题相关推荐

python 取反_利用python怎么对bool布尔值进行取反
利用python怎么对bool布尔值进行取反发布时间:2020-12-14 14:49:17 来源:亿速云阅读:71 这期内容当中小编将会给大家带来有关利用python怎么对bool布尔值进行取反 ...
python取反操作_在python中对于bool布尔值的取反操作
背景根据公司业务的需求,需要做一个对于mysql数据库的大批量更新.脚本嘛也是干干单单.使用了redis的队列做缓存,可以异步并发的多任务进行更新. 有点难受的地方在于,请求访问时,因为一些网速,速 ...
java+如何解决反爬虫_反爬虫，到底是怎么回事儿？
原标题:反爬虫,到底是怎么回事儿? 有位被爬虫摧残的读者留言问:「网站经常被外面的爬虫程序骚扰怎么办,有什么方法可以阻止爬虫吗? 」这是个好问题,自从 Python 火了起来,编写爬虫程序的门口越来 ...
爬空气质量MySQL_爬虫：利用selenium采集某某环境网站的空气质量数据
前言:在上一篇文章中,我们介绍了在http://PM2.5.in这个网站采集空气质量的数据,本篇文章是对其产生的一些问题的另一种解决方案,提供更加权威的数据采集. 技术框架:selenium.json ...
【爬虫】利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中（2）...
[爬虫]利用Python爬虫爬取小麦苗itpub博客的所有文章的连接地址并写入Excel中(2) 第一篇( http://blog.itpub.net/26736162/viewspace-22865 ...
python把汉字变成拼音英文_利用python将表格中的汉字转化为拼音
GB18030的字符集标准 http://zbgb5.com/2/StandardDetail479488.htm 缺少包时用pip install 进行安装,例如: pip install xlsx ...
python爬虫之爬取起点中文网小说
python爬虫之爬取起点中文网小说 hello大家好,这篇文章带大家来制作一个python爬虫爬取阅文集团旗下产品起点中文网的程序,这篇文章的灵感来源于本人制作的一个项目:电脑助手启帆助手 ⬆是项 ...
python 爬虫抓取网页数据导出excel_Python爬虫|爬取起点中文网小说信息保存到Excel...
前言: 爬取起点中文网全部小说基本信息,小说名.作者.类别.连载\完结情况.简介,并将爬取的数据存储与EXCEL表中环境:Python3.7 PyCharm Chrome浏览器主要模块:xlwt ...
Python之起点中文网爬虫
Python之起点中文网爬虫注:请勿用于其他用途,仅供学习使用 import requests import re import os from lxml import etreehead = {& ...

pythonttf字体反爬虫_利用Python采集起点中文网小说，并解决字体反爬的问题

pythonttf字体反爬虫_利用Python采集起点中文网小说，并解决字体反爬的问题相关推荐

最新文章

热门文章