python练手漫画爬虫，代码，软件成品打包下载链接，效果图

#-*-coding:GBK -*-
import urllib.request
import lxml
import pyquery
import zlib
import winreg #操作注册表
from bs4 import BeautifulSoup
import requests
import re
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import threading  #多线程
import os
from selenium.webdriver.support.select import Select
import tkinter as tk
import tkinter.messagebox as msg
from tkinter import *
import win32gui,win32api,win32con
from win32gui import *
def key360():
    r"""Return the full path of the 360 browser executable (``360se.exe``).

    The path is taken from the first value stored under the registry key
    ``HKEY_CLASSES_ROOT\360SeSES\shell\open\command``: everything up to and
    including ``360se.exe`` is kept, any trailing arguments are dropped.

    Returns:
        str: Path ending in ``360se.exe``.

    Raises:
        OSError: If the registry key or its first value cannot be read.
        IndexError: If the stored command does not mention ``360se.exe``.
    """
    # Raw string: the backslashes are registry path separators, not escapes.
    sub_key = r'360SeSES\shell\open\command'
    # The key handle is a context manager; close it as soon as the value is read.
    with winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, sub_key) as handle:
        _name, command, _kind = winreg.EnumValue(handle, 0)  # name, data, data type

    prefixes = re.findall(r"(.+?)360se\.exe", command)
    if not prefixes:
        raise IndexError('360se.exe not found in registry command: %r' % (command,))
    # A quoted shell command would leave a leading '"' on the prefix; a Windows
    # path can never contain that character, so dropping it is always safe.
    return prefixes[0].lstrip('"') + '360se.exe'
def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread and return at once.

    The thread is a daemon so it never keeps the process alive after the
    window is closed. It is deliberately not joined: joining here would block
    the caller (the GUI) until ``func`` finished.

    Args:
        func: Callable to run in the background.
        *args: Positional arguments forwarded to ``func``.
    """
    # daemon=True replaces the deprecated Thread.setDaemon(True).
    worker = threading.Thread(target=func, args=args, daemon=True)
    worker.start()


def getlink(url):
    """Install a process-wide urllib opener that sends a desktop Chrome User-Agent.

    After this call every ``urllib.request.urlopen`` request carries the header.

    Args:
        url: Unused; kept so existing call sites keep working.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
def get_link(url):
    """Scrape one comic index page: create the comic folder and queue its chapters.

    The page is fetched with urllib, the comic title is read from the
    ``class="pic" title="..."`` attribute, and a folder named after it is
    created in the current working directory. Every ``href`` found inside the
    ``class="plist pmedium max-h200"`` block is then appended to ``link.txt``
    as an absolute ``http://www.pufei8.com`` URL, one per line.

    Side effects:
        Sets the globals ``file_path`` (absolute comic folder), ``path_file1``
        (comic title), ``filename`` (``'link.txt'``) and ``filename1``
        (``'link1.txt'``); creates the folder; appends to ``link.txt``.

    Args:
        url: Address of the comic's index page.
    """
    global file_path, path_file1, filename, filename1

    # Install the User-Agent opener first so this very request carries it.
    getlink(url)
    with urllib.request.urlopen(url) as response:
        # NOTE(review): 'ANSI' appears to resolve only on Windows (alias of the
        # active code page codec) - confirm before running elsewhere.
        html = response.read().decode('ANSI')

    pattern_path = ('class="pic" title=\"((.*?))\"')
    for title_match in re.compile(pattern_path, re.S).findall(html):
        # The pattern has two nested groups, so each match is a (title, title) tuple.
        title = title_match[0]
        file_path = os.path.join(os.getcwd(), str(title))
        # Record the title even when the folder already exists, so a resumed
        # download has the same globals available as a fresh one.
        path_file1 = title
        if os.path.exists(file_path):
            print(file_path+' 目录已存在')
        else:
            os.mkdir(file_path)

    # Text between class="plist pmedium max-h200" and the first </div>.
    chapter_block = re.compile('class=\"plist pmedium max-h200\".*?>(.*?)</div>', re.S)
    pattern = ('href=\"((.*?))\"')
    links = re.compile(pattern, re.S).findall(str(chapter_block.findall(html)))

    filename = 'link.txt'
    filename1 = 'link1.txt'
    # Open the queue file once instead of once per link.
    with open(filename, 'a') as file_object:
        for link in links:
            gethtml = 'http://www.pufei8.com' + link[0]
            print(gethtml)
            file_object.write(gethtml + "\n")
def get_link2():
    """Queue the chapters of every selected comic and return the queued lines.

    Calls ``get_link`` for each URL produced by ``get_page`` and then reads
    back the file named by the global ``filename``.

    Returns:
        list[str]: The lines of that file, trailing newlines included.
    """
    for page_url in get_page():
        get_link(page_url)
    with open(filename) as queue_file:
        return queue_file.readlines()
def liulanqi():
    """Download every queued chapter, page by page, through a Selenium-driven 360 browser.

    Steps:
      1. ``get_link2()`` returns the queued chapter URLs, one per line.
      2. A Chrome driver is started against the browser binary reported by
         ``key360()``, using the chromedriver expected at
         ``<cwd>/78.0.39.4.108/chromedriver.exe``.
      3. For each chapter the name is read from the page's ``viewname = "..."``
         text and a folder for it is created under the global ``file_path``.
         Every page listed in the chapter's ``<select>`` element is opened and
         the ``src`` of the ``viewimg`` element is saved as ``<page>.jpg``.
      4. A chapter whose pages were all processed is appended to ``link1.txt``.

    Side effects:
        Sets the global ``driver``; reads the global ``file_path``; writes
        image files and ``link1.txt``; quits the browser when finished or on
        error.
    """
    global driver

    link4 = get_link2()

    chrome_options = Options()
    chrome_options.binary_location = key360()  # path of the 360 browser executable
    # Hide the automation switch so sites are less likely to detect Selenium.
    # This must be set before the driver is created: the options are only read
    # when the browser session starts.
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver_path = os.path.join(os.getcwd(), "78.0.39.4.108", "chromedriver.exe")
    driver = webdriver.Chrome(driver_path, options=chrome_options)

    # Keep only Chinese characters, letters and digits in folder names.
    name_chars = re.compile('[\u4e00-\u9fa5a-zA-Z0-9]+', re.S)
    viewname_pattern = re.compile('viewname = \"((.*?))\"', re.S)

    try:
        driver.maximize_window()
        for line in link4:
            url = line.rstrip()
            if not url:
                continue
            driver.get(url)
            tag = driver.find_element_by_tag_name("select")
            # Read the page count now; `tag` goes stale once the driver navigates away.
            page_count = len(Select(tag).options)

            getlink(url)
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('ANSI')
            viewnames = viewname_pattern.findall(html)
            if not viewnames:
                print(url + ' 未找到章节名称，已跳过')
                continue

            file_path_zhang = None
            for viewname in viewnames:
                chapter_name = "".join(name_chars.findall(str(viewname[0])))
                file_path_zhang = os.path.join(file_path, chapter_name)
                if os.path.exists(file_path_zhang):
                    print(file_path_zhang+' 目录已存在')
                else:
                    os.mkdir(file_path_zhang)

            # Images go into the folder of the last chapter name found.
            for page in range(1, page_count + 1):
                url1 = url + r'?page=' + str(page)
                print(url1)
                file_path_img = os.path.join(file_path_zhang, '{}.{}'.format(page, 'jpg'))
                if os.path.exists(file_path_img):
                    # Already downloaded on an earlier run; skip the fetch.
                    continue
                driver.get(url1)
                tag1 = driver.find_element_by_id("viewimg").get_attribute("src")
                if not tag1:
                    print(url1 + ' 图片地址为空，已跳过')
                    continue
                response = requests.get(tag1, timeout=30)
                with open(file_path_img, 'wb') as f:
                    f.write(response.content)

            if page_count:
                # Record the chapter in the finished list.
                with open('link1.txt', 'a') as file_object:
                    file_object.write(url + "\n")
    finally:
        # Always release the browser and its chromedriver process.
        driver.quit()
def test(content):  #输入框内容限定为数字# 如果不加上==""的话，就会发现删不完。总会剩下一个数字if content.isdigit() or content == "" :return Trueelse:return False
def main_win():
    """Build the main window and run the Tk event loop (blocks until it is closed).

    The window holds a digits-only entry for the comic id and three buttons:
    start a fresh download, continue an interrupted one, and open the site in
    the browser. The entry widget is published as the global ``e`` so that
    ``get_page`` can read it.
    """
    global e

    root1 = tk.Tk()
    root1.resizable(0, 0)
    root1.title("漫画爬虫")
    dialog_width = 360
    dialog_height = 180
    screenwidth = root1.winfo_screenwidth()
    screenheight = root1.winfo_screenheight()
    # Centre the fixed-size window on the screen.
    root1.geometry("%dx%d+%d+%d" % (dialog_width, dialog_height, (screenwidth-dialog_width)/2, (screenheight-dialog_height)/2))

    def start():
        """Clear both progress files and start downloading from scratch."""
        for progress_file in ("link.txt", "link1.txt"):
            with open(progress_file, 'w'):
                pass  # opening in 'w' mode truncates the file
        s1.config(state=tk.DISABLED)
        thread_it(liulanqi)

    def con_start():
        """Drop finished chapters from the queue file and resume downloading."""
        with open("link.txt") as file_object:
            queued = file_object.readlines()
        with open("link1.txt") as file_object:
            finished = set(file_object.readlines())
        with open("link.txt", 'w') as file_object:
            file_object.writelines(line for line in queued if line not in finished)
        s1.config(state=tk.DISABLED)
        thread_it(liulanqi)

    test_cmd = root1.register(test)
    v = IntVar()
    e = Entry(
        root1,
        width=10,
        textvariable=v,
        validate='key',  # run validatecommand on every edit
        validatecommand=(test_cmd, '%P'),  # %P is the text the entry would hold after the edit
    )
    e.place(x=150, y=20)

    # start/con_start are quick and touch widgets, so they run directly on the
    # Tk thread; only the long-running work is handed to a worker thread.
    s1 = tk.Button(root1, text='开始下载', font=('宋体', 12), width=8, height=1, command=start)
    s1.place(x=100, y=60)
    s2 = tk.Button(root1, text='继续下载', font=('宋体', 12), width=8, height=1, command=con_start)
    s2.place(x=200, y=60)
    s3 = tk.Button(root1, text='打开网页', font=('宋体', 12), width=8, height=1, command=lambda: thread_it(open_page))
    s3.place(x=150, y=120)
    root1.mainloop()


def get_page():
    """Return the index-page URL of the comic whose id is typed in the entry box.

    Reads the global entry widget ``e`` and stores the parsed id in the global
    ``eput``.

    Returns:
        list[str]: A one-element list, ``['http://www.pufei8.com/manhua/<id>']``.

    Raises:
        ValueError: If the entry is empty or not an integer.
    """
    global eput
    eput = int(e.get())
    return ['http://www.pufei8.com/manhua/' + str(eput)]
def open_page():
    """Open the comic site's home page in a Selenium-driven 360 browser.

    Uses the browser binary reported by ``key360()`` and the chromedriver
    expected at ``<cwd>/78.0.39.4.108/chromedriver.exe``. The driver is stored
    in the global ``driver`` and the browser is left open for the user.
    """
    global driver

    chrome_options = Options()
    chrome_options.binary_location = key360()  # path of the 360 browser executable
    # Hide the automation switch so sites are less likely to detect Selenium.
    # This must be set before the driver is created: the options are only read
    # when the browser session starts.
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver_path = os.path.join(os.getcwd(), "78.0.39.4.108", "chromedriver.exe")
    driver = webdriver.Chrome(driver_path, options=chrome_options)
    driver.maximize_window()
    driver.get("http://www.pufei8.com")
# Script entry point: build the window and enter the Tk main loop.
main_win()

360浏览器必装，自己的360浏览器可能因为内核版本不同而停留在data页面
没做病毒免杀，安装请关掉卫士杀毒或添加白名单使用
任选一部漫画举例 http://www.pufei8.com/manhua/1/209670.html?page=14
manhua/后面的1就是要爬的整部漫画，在输入框输入即可，如果没有对应链接会报错，异常没有处理，退出重来就可以了。
漫画没有下载完整而关掉了软件可以再打开点击继续下载，这时不要点击开始下载，会清空下载进度。
这个网站有时会刷不出图片而跳过下载，只好把下载失败的图片删除重试了。
下载的漫画就生成在软件所在的目录文件夹
看图王可以对图片自动排序，还能跨文件夹连续浏览图片，非常方便，很适合这种目录结构的漫画。
有其他问题可以在CSDN上私信我。

成品链接：https://pan.baidu.com/s/1hroQfMupsoEGGimSUQ9g_Q
提取码：cgqw

python练手漫画爬虫，代码，软件成品打包下载链接，效果图相关推荐

python实战-HTML形式爬虫-批量爬取电影下载链接
文章目录一.前言二.思路 1.网站返回内容 2.url分页结构 3.子页面访问形式 4.多种下载链接判断三.具体代码的实现四.总结一.前言喜欢看片的小伙伴,肯定想打造属于自己的私人影院 ...
python练手经典100例-Python练手项目实例汇总（附源码下载）
1 #_*_ coding:utf-8 _*_ 2 from tkinter import * 3 importrandom4 importtime5 importtkinter.messagebox ...
10个不到500行代码的超牛Python 练手项目
10个不到500行代码的超牛Python 练手项目图:内容概览.注:本文内容由实验楼搜集.整理自Github,实际项目版权归原作者所有. 以下10个练手项目均摘录自一本尚未出版的 Python 神书 ...
70个Python练手项目列表预祝大家快乐
小孩眺望远方,成人怀念故乡. 为此给大家分享一下珍藏的Python实战项目,祝大家节日快乐哦!!! Python 前言:不管学习哪门语言都希望能做出实际的东西来,这个实际的东西当然就是项目啦,不用多说 ...
70个Python练手项目列表，偷偷练习卷死他们
不管学习哪门语言都要做出实际的东西来,这个实际的东西就是项目这里整理了70个Python实战项目列表,都有完整且详细的教程,你可以从中选择自己想做的项目进行参考学习练手,你也可以从中寻找灵感去做自己 ...
python练手小程序—调整图片分辨率(大小)
在GitHub上发现一些很有意思的项目,由于本人作为Python的初学者,编程代码能力相对薄弱,为了加强Python的学习,特此利用前辈们的学习知识成果,自己去亲自实现. 一周没有更新了,主要还是自己 ...
Python练手----字符串的密钥加密
Python练手----字符串的密钥加密最近各种笔试加上leetcode刷题遇到很多加密题目,这些题目大同小异都是给出一个原始字符串和一个密钥.通过密钥和26个英文字母的对应关系对原文字符串进行加密 ...
Python毕设-【课堂人脸签到系统】附源码课件/Python练手项目/Python毕业设计
Python毕设-[课堂人脸签到系统]附源码课件/允许白嫖文章目录 Python毕设-[课堂人脸签到系统]附源码课件/允许白嫖系统简介一.本课题拟解决的问题二.系统技术栈三.开发工具四.数 ...
Python毕设-【人脸签到系统】附源码/Python练手项目/Python毕业设计
本人承诺只做技术分享,永不收费. V----------------->:专栏详情文章目录本人承诺只做技术分享,永不收费. V----------------->:专栏详情一.背景 ...

python练手漫画爬虫，代码，软件成品打包下载链接，效果图

python练手漫画爬虫，代码，软件成品打包下载链接，效果图相关推荐

最新文章

热门文章