python爬虫小说下载

笔趣阁的网页结构比较简单，但也有点乱，需要注意细节。

需要增加运行日志

#-*- coding:utf8 -*-
#从https://www.xbiquge.cc/网站下载小说
#https://www.xbiquge.cc/book/9860/
#https://www.xbiquge.cc/book/9860/7063460.html
#catalog目录，chapter章节
#r'[\u4e00-\u9fa5]+' 1到任意多个汉字
#r'\d{1,10}' 章节链接编号,章节链接在类名为box_con的第2个div中
#r'[\u4e00-\u9fa5]+\d{1,4}[\u4e00-\u9fa5]+ [\u4e00-\u9fa5]+' 小说章节名
import requests
import json
import re
import time
import os
import sys
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm
# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

# Base URL of the book's catalog page, e.g. https://www.xbiquge.cc/book/9860/
url = input('please input url:').strip()
if len(url) < 24:
    # Input is too short to be a book URL: fall back to a default book so the
    # script can be tested by just pressing Enter.
    # Alternative default: 'https://www.xbiquge.cc/book/9860/'
    url = 'https://www.xbiquge.cc/book/14779/'
if not url.endswith('/'):
    # Chapter pages are requested as url + '<id>.html', so the base URL must
    # end with a slash.
    url += '/'

# Template of the per-novel output folder; '{}' is filled with the novel name.
rootPath = r'C:\Users\QQ\Desktop\ls\py\{}'

# Folder / file name used for the novel.
# Original note: either this hard-coded name or the saveCatalog() call must be
# commented out, because saveCatalog() overwrites `name` from the page title.
# Alternative: name = '我的微信连三界 狼烟新书'
name = '一世兵王 我本疯狂新书'
def getCatalog():
    """Load the novel's catalog page and extract every chapter link and name.

    The catalog HTML is cached on disk as '<rootPath>/<name>/目录.txt'.  When
    that file does not exist yet it is downloaded first (which also derives
    the global ``name`` from the page title); afterwards the cached HTML is
    parsed.

    Side effects:
        Sets the module globals ``cul`` (chapter URL list) and ``cnl``
        (chapter name list) and writes '<rootPath>/<name>/章节链接.txt'.
    """

    def saveCatalog():
        """Download the catalog page, set ``name`` from its title, cache the HTML."""
        global name
        # A timeout keeps the script from hanging forever on a stalled server.
        rep = requests.get(url, headers=headers, timeout=30)
        print(rep.text[:10])
        rep.encoding = 'gbk'
        soup = BeautifulSoup(rep.text, 'lxml')
        title = soup.title.contents[0]
        print(title)
        # Novel name = text before the first space + text between the first
        # pair of underscores in the page title.
        name = (re.findall('(.+?) ', title))[0] + ' ' + (re.findall('_(.+?)_', title))[0]
        print(name)
        # Create the folder that all later output files are written into.
        mkDir(path=rootPath.format(name))
        f1 = os.path.join(rootPath.format(name), '{}.txt'.format('目录'))
        with open(f1, 'w') as f:
            f.write(rep.text)

    def findAllChapter():
        """Parse the cached catalog HTML into the globals ``cul`` and ``cnl``."""
        global cul, cnl
        folder = rootPath.format(name)
        f1 = os.path.join(folder, '{}.txt'.format('目录'))
        f2 = os.path.join(folder, '{}.txt'.format('章节链接'))
        with open(f1, 'r') as f:
            rep = f.read()
        soup = BeautifulSoup(rep, 'lxml')
        # Only the element with id="list" is searched for chapter links.
        s = str(soup.find(id='list'))
        soup = BeautifulSoup(s, 'lxml')
        ss = soup.find_all('a')
        cul = re.findall(r'\d{7,8}.html', s)  # chapter URL list
        # Chapter name list.  Pattern history from earlier attempts: stricter
        # patterns missed '第373章 B级任务' (non-Chinese characters in the title),
        # '010 章 搂腰算非礼吗？' (no '第', space before '章') and a chapter 137
        # that has no title at all.
        # NOTE(review): this pattern was split across a line break in the
        # source; confirm no character was lost between '章? ' and '?.*?'.
        cnl = re.findall(r'>(第?\d{1,4} ?章? ?.*?)<', s)
        print(len(ss), len(cul), len(cnl))
        print(cul, cnl)
        print('len(cul):', len(cul), 'len(cnl):', len(cnl))
        # Sanity check of the regular expressions: compare every <a> tag with
        # the regex results and report the first disagreement.  Bounded by the
        # shortest list so books of any length can be checked.
        for i in range(min(len(ss), len(cul), len(cnl))):
            c = str(ss[i])
            link = re.search(r'\d{7,8}.html', c)
            cu = link.group() if link else ''
            # Presumably +7 skips '.html">' and -4 drops the closing '</a>'.
            cn = c[c.index('.html') + 7:-4] if link else ''
            if cu != cul[i] or cn != cnl[i]:
                print(cu, cul[i], cu == cul[i], cn, cnl[i], cn == cnl[i])
                break
        # One line per chapter: link immediately followed by the chapter name.
        with open(f2, 'w') as f:
            f.writelines(u + n + '\n' for u, n in zip(cul, cnl))
        if len(cul) == len(cnl):
            print('All url and name of chapters from source have been saved in this file:{}'.format(f2))
        else:
            # The regular expressions need adjusting to the page's markup.
            print('Rules require changes the regular expression')

    # If the catalog has not been saved yet, fetch and save it; otherwise go
    # straight to extracting the chapter information.
    catalog = os.path.join(rootPath.format(name), '{}.txt'.format('目录'))
    if not os.path.exists(catalog):
        saveCatalog()
    findAllChapter()


def mkDir(path):
    """Create directory ``path`` (and any missing parents) if it does not exist."""
    os.makedirs(path, exist_ok=True)


def missingChapter():
    """Detect gaps in the chapter numbering of the global ``cnl``.

    The first run of 1-4 digits in each chapter name is taken as its number;
    names without digits are skipped.  Any numbers missing between two
    consecutive chapters are printed.

    Returns:
        list[int]: ``[0]`` followed by the chapter numbers in catalog order.
    """
    nl = [0]  # chapter number list, seeded with 0 so chapter 1 has a predecessor
    ml = []   # missing chapter number list
    for chapter_name in cnl:
        found = re.search(r'\d{1,4}', chapter_name)
        if found is None:
            continue
        number = int(found.group())
        # Everything strictly between the previous and the current number is
        # missing; the range is empty for consecutive or repeated numbers.
        ml.extend(range(nl[-1] + 1, number))
        nl.append(number)
    if ml:
        print('missing chapters :', ml)
    return nl
def saveChapter():
    """Download the remaining chapters and append their text to the novel file.

    The start position is whatever ``modify()`` returns for the chapters
    already present in '<rootPath>/<name>/<name>.txt'.  Each chapter page is
    fetched from ``url + <chapter link>``; every line containing four
    ``&nbsp;`` followed by text is treated as one paragraph.  Downloading
    stops at the first chapter whose extracted text is 1200 characters or
    shorter.
    """
    f3 = os.path.join(rootPath.format(name), '{}.txt'.format(name))
    with open(f3, 'a') as f:
        # modify() opens f3 for reading, so it has to run after the append-mode
        # open above has created the file.  It is called once because every
        # call rescans the whole file and prints statistics.
        start = modify()
        for cu, cn in zip(cul[start:], cnl[start:]):
            # A timeout keeps one stalled request from hanging the whole run.
            rep = requests.get(url + cu, headers=headers, timeout=30)
            rep.encoding = 'gbk'
            paragraphs = []
            for line in rep.text.splitlines():
                found = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.+)<', line)
                if found:
                    paragraphs.append(found[0] + '\n')
            content = ''.join(paragraphs)
            if len(content) > 1200:
                f.write(content)
                # A blank line marks the end of a chapter.
                f.write('\n')
                print('contents has been writen to file which from : {} {}'.format(cu, cn))
            else:
                # Chapters with too little text are not written; only the
                # separator is, and the run stops so the page can be inspected.
                f.write('\n')
                print(content)
                print('There are problems in this chapter : {} {} !!!'.format(cu, cn))
                break
def runlog():
    """Placeholder for the run log (not implemented yet).

    Intended to record, for every run: how long it took, when it ran, which
    chapters were saved, which are missing and which were newly added.
    """
    pass
def modify():
    """Scan the saved novel and write a copy with missing chapter titles added.

    Reads '<rootPath>/<name>/<name>.txt', prints character statistics via
    ``cc()``, and copies every line to '<rootPath>/<name>/modify.txt'.  A
    blank line is treated as a chapter separator; when the line after a
    separator contains neither '第' nor '章', the matching entry of the global
    ``cnl`` is inserted as the chapter title.

    Original purpose note: check the file for advertisements, stray
    characters and empty chapters, and use the findings to improve
    saveChapter().

    Returns:
        int: number of blank separator lines that are followed by another
        line.  A blank line that is the very last line of the file is not
        counted, because there is no following line to inspect.
    """
    folder = rootPath.format(name)
    f3 = os.path.join(folder, '{}.txt'.format(name))
    f4 = os.path.join(folder, '{}.txt'.format('modify'))
    with open(f3, 'r') as f, open(f4, 'w') as fs:
        # cc() reads the whole file and rewinds it, so readlines() below
        # still sees the complete content.
        cc(f)
        c = 0
        li = f.readlines()
        for n, line in enumerate(li):
            fs.write(line)
            if line == '\n' and n < len(li) - 1:
                c += 1
                following = li[n + 1]
                # Guard the index: the file can contain more separators than
                # there are known chapter names.
                if '第' not in following and '章' not in following and c < len(cnl):
                    fs.write(cnl[c] + '\n')
    print('c :', c, 'cnl[c] :', cnl[c] if c < len(cnl) else None)
    return c
def cc(file):
    """Count the characters of an open text file by category and print a summary.

    Categories: Chinese characters, English letters, Chinese punctuation,
    English punctuation, digits and line breaks.  Characters that fit none of
    them are written to '<rootPath>/<name>/other characters.txt'; there should
    normally be none, so anything ending up there may indicate garbled text.

    Args:
        file: readable, seekable text file object.  It is read to the end and
            then rewound to position 0 so the caller can read it again.
    """
    f00 = os.path.join(rootPath.format(name), '{}.txt'.format('other characters'))
    chinese_punctuation = '·、【】！@￥—～……（）；‘’：“”《》，。？、'
    # The backslash is doubled so the literal holds a real '\' without relying
    # on an invalid escape sequence.
    english_punctuation = ''' `~!@#$%^&*()_+-={}|:％"<>?[]\\;',./×'''
    chinese = letters = cn_marks = en_marks = digits = line_breaks = 0
    string = file.read()
    # UTF-8 so that any unexpected character can be written, whatever the
    # platform's default encoding is.
    with open(f00, 'w', encoding='utf-8') as f:
        for ch in string:
            code = ord(ch)
            if 19968 <= code <= 40869:  # U+4E00..U+9FA5
                chinese += 1
            elif 65 <= code <= 90 or 97 <= code <= 122:
                letters += 1
            elif ch in chinese_punctuation:
                cn_marks += 1
            elif ch in english_punctuation:
                en_marks += 1
            elif 48 <= code <= 57:
                digits += 1
            elif ch == '\n':
                line_breaks += 1
            else:
                f.write(ch)
    # Share of Chinese characters among all characters; 0 for an empty file.
    ratio = chinese / len(string) if string else 0.0
    file.seek(0)
    labels = ['中文', 'english letter', '中文标点符号', 'english punctuation marks', '数字', '行数']
    totals = [chinese, letters, cn_marks, en_marks, digits, line_breaks]
    for label, total in zip(labels, totals):
        # Counts are reported in units of ten thousand (万).
        print('{} : {:.2f}万'.format(label, total / 10000))
    print('{} : {:.2%}'.format('中文字数占总字符数的比例', ratio))
def main():
    """Run the whole pipeline once and print how long it took.

    Steps: load the catalog, check the chapter numbering for gaps, download
    the chapters, then produce the cleaned-up copy of the novel.
    """
    start = time.perf_counter()
    getCatalog()
    missingChapter()
    saveChapter()
    modify()
    elapsed = time.perf_counter() - start
    minutes, seconds = divmod(elapsed, 60)
    print('total time consuming : ', minutes, 'minutes', seconds, 'seconds')


# Only start downloading when executed as a script, not when imported.
if __name__ == '__main__':
    main()

需要改进的地方：

逐一访问各章节非常耗时
没有完全避开广告信息
笔趣阁网页内容经常缺失，正则表达式未完全适应所有情况
章节序号为中文时，无法匹配

python爬虫小说下载相关推荐

python爬虫小说下载到txt文档_python 爬取网络小说清洗并下载至txt文件
什么是爬虫网络爬虫,也叫网络蜘蛛(spider),是一种用来自动浏览万维网的网络机器人.其目的一般为编纂网络索引. 网络搜索引擎等站点通过爬虫软件更新自身的网站内容或其对其他网站的索引.网络爬虫可以 ...
python爬虫小说下载异步
看别人用异步请求快的飞起, 忍不住手痒尝试了下, 但过程并不美好, 一不小心就是满屏的"红色警告" 不过成功的喜悦总是让人陶醉的, 这也是学习的魅力吧先看下运行过程需要插入一个 ...
Python爬虫实战——下载小说
Python爬虫实战--下载小说前言第三方库的安装示例代码效果演示结尾前言使用requests库下载开源网站的小说注意:本文仅用于学习交流,禁止用于盈利或侵权行为. 操作系统:wind ...
python爬虫-小说《大江大河》
python爬虫-小说《大江大河》 最近看了电视剧《大江大河》,挺好看的,就在网上找找小说看. 大江大河小说地址:傲宇中文网 ...
python爬虫下载-python爬虫之下载文件的方式总结以及程序实例
python爬虫之下载文件的方式以及下载实例目录第一种方法:urlretrieve方法下载第二种方法:request download 第三种方法:视频文件.大型文件下载实战演示第一种方法: ...
python 下载文件-python爬虫之下载文件的方式总结以及程序实例
python爬虫之下载文件的方式以及下载实例目录第一种方法:urlretrieve方法下载第二种方法:request download 第三种方法:视频文件.大型文件下载实战演示第一种方法: ...
python爬虫批量下载“简谱”
python讨论qq群:996113038 导语: 上次发过一篇关于"python打造电子琴"的文章,从阅读量来看,我们公众号的粉丝里面还是有很多对音乐感兴趣的朋友的.于是,今天我 ...
新一配：perl循环调用python爬虫批量下载喜马拉雅音频
新一配:perl循环调用python爬虫批量下载喜马拉雅音频手机下载喜马拉雅音频后,获得的音频文件虽然可以转成mp3格式,但其文件名却是一长串字符串,无法辨别是哪一集,网上找了各种工具,都有局限性, ...
2021-04-01裁判文书网数据python爬虫更新下载
长期持续更新数据 2020-11-08裁判文书网数据python爬虫更新下载添加链接描述截至3月已从数据库中下载1亿1200万条裁判文书数据,有需要数据的伙伴可以(｡･∀･)ﾉﾞ嗨前台QQ7900- ...

python爬虫小说下载

python爬虫小说下载相关推荐

最新文章

热门文章

python爬虫 小说下载

python爬虫 小说下载相关推荐

最新文章

热门文章

python爬虫小说下载

python爬虫小说下载相关推荐