Python爬虫获取电影链接(续)

上一篇文章中的两个网站用的都是get方法，获取很简单，并且没有任何反爬手段。后来我又尝试了BT天堂、影视大全网，发现更多网站的搜索页面使用的是post请求，并需要提交表单，

所以这里给之前的程序作出一些补充，使之可以爬取需要post请求的网站。

首先介绍一个使用fiddler的小技巧——断点查询：点击Rules，在其下拉列表中选择Automatic breakpoint，之后选择After Request，这样更容易查询到浏览器提交的相关信息以及请求的url。

这里以BT天堂为例，其网站url为 http://www.bttiantangs.com/ 。当然，最好的方案是先进入该网页，获取该网页的cookies，但是我在实践中发现该网站并不会检查cookies。

通过fiddler发现提交请求的url为http://www.bttiantangs.com/e/search/new.php 请求类型为post 提交的数据只有一个‘keyboard’，值得注意的是这一项数据直接提交字符串不需要其他处理。

为了扩展性，将网址所有相关信息放在一个字典中：

# Everything the crawler needs to know about the BT天堂 site, kept in one dict
# so that another POST-based site can be supported by adding another dict.
btmovie = {
    'name': 'BT天堂',                                            # site name
    'root': 'http://www.bttiantangs.com',                        # site home page
    'posturl': 'http://www.bttiantangs.com/e/search/new.php',    # search request URL
    'dict': {'keyboard': ''},                                    # form data to submit
    'encode': 'utf-8',                                           # page encoding
    'pat': [                                                     # extraction regexes
        '<a href="(.*?)" class="zoom" rel="bookmark" target="_blank" title="(.*?)">',
        r'<em>.*?</em></a><a href="(.*?)" target="_blank">',
    ],
}

其信息包括网站主页地址、网站名称、请求url、编码格式、提交的表单，以及提取信息所用的正则表达式。这样做的好处在于：需要添加更多网站时，只需增加一个字典即可。数据存储的最后一步是将此类型网站的数据放入一个列表中：

# All sites whose search page must be requested with POST.
postlist = [btmovie]

创建一个线程用以加工所需要提交的表单数据：

class posttask(threading.Thread):
    """Fill in the search form of every POST-based site, then start the crawler.

    Args:
        keyword: the search text typed by the user.
    """

    def __init__(self, keyword):
        threading.Thread.__init__(self)
        self.keyword = keyword

    def run(self):
        """Put the keyword into each site's form dict and start ``posturlget``."""
        for item in postlist:
            # Each site names its search field differently, so dispatch on the
            # site name; BT天堂 takes the raw string in 'keyboard'.
            if item['name'] == 'BT天堂':
                item['dict']['keyboard'] = self.keyword
        # Start a single crawler once every form has been prepared.
        se = posturlget(urls=postlist)
        se.start()

这个类可以通过判断网站名对不同网站进行提交表单的加工，并传入请求url

接下来需要请求搜索结果，并提取搜索结果中的相关链接，为此创建一个新的线程，即上面程序中的posturlget类

class posturlget(threading.Thread):
    """POST each site's search form and queue the detail-page URLs it returns.

    Args:
        urls: list of site dicts with the keys ``name``, ``root``,
            ``posturl``, ``dict`` (form data), ``encode`` and ``pat``
            (``pat[0]`` matches detail links in the search result,
            ``pat[1]`` is handed to the spider thread).
    """

    def __init__(self, urls):
        threading.Thread.__init__(self)
        self.urls = urls
        self.encode = ''
        self.pat = []
        self.id = ''
        self.data = {}
        self.res = ''
        self.root = ''

    def postlist(self, url, counts=3):
        """POST ``self.data`` to *url* and store the decoded body in ``self.res``.

        A failed request is retried up to *counts* more times, one second
        apart; when every attempt fails ``self.res`` is left unchanged.
        """
        while True:
            try:
                # A timeout keeps a dead server from blocking the thread forever.
                webpage = requests.post(headers=headers, data=self.data, url=url, timeout=5)
                webpage.encoding = self.encode
                self.res = webpage.text
                return
            except Exception as f:
                print(f)
                if counts <= 0:
                    print('请求错误')
                    return
                counts -= 1
                time.sleep(1)

    @staticmethod
    def _detail_urls(root, matches):
        """Prefix *root* to each match; tuple matches contribute their first group."""
        return [root + (m if isinstance(m, str) else m[0]) for m in matches]

    def fenpei(self):
        """Queue every detail URL found in ``self.res`` and start a spider for the site."""
        urllist = self._detail_urls(self.root, re.findall(self.pat[0], self.res))
        if not urllist:
            print("没有匹配项")
            return
        for item in urllist:
            Cannel[self.id].put(item)
            # NOTE(review): task_done() is meant for the consumer side; it is
            # kept here only to preserve the queue's unfinished-task count.
            Cannel[self.id].task_done()
        print("分配完成")
        canshu = {'name': self.id, 'pat': self.pat[1], 'encode': self.encode}
        task = spdier(dic=canshu)
        task.start()

    def run(self):
        """Search every configured site in turn."""
        for item in self.urls:
            self.encode = item['encode']
            self.pat = item['pat']
            self.data = item['dict']
            self.id = item['name']
            self.root = item['root']
            # Clear the previous site's response so a failed request is not
            # mistaken for a result of this site.
            self.res = ''
            self.postlist(url=item['posturl'])
            if self.data and self.res:
                self.fenpei()

这里加入一些异常处理措施，让爬虫遇到异常时不至于崩溃，并且提取出相关详情页面的链接，存放在列表中，同时生成一个字典存放爬虫需要的相关数据，传入最后的爬虫线程

这一步其实就是通过一个post请求获取搜索结果页面，并提取出其中的详情链接，存放到相应队列中

最后跟上一个程序一样，是具有较强通用性的下载链接获取爬虫：

class spdier(threading.Thread):
    """Generic detail-page crawler for one site.

    Takes detail-page URLs from ``Cannel[name]``, downloads each page,
    extracts download links with ``pat`` into ``downloadurl[name]`` and
    writes them to ``f://<name>.txt``.

    Args:
        dic: dict with the keys ``name`` (site name, used as the key into
            ``Cannel`` and ``downloadurl``), ``pat`` (regex for download
            links) and ``encode`` (page encoding).
    """

    def __init__(self, dic):
        threading.Thread.__init__(self)
        self.id = dic['name']
        self.pat = dic['pat']
        self.encode = dic['encode']
        self.data = []      # text of every detail page fetched so far
        self.wait = 5       # idle polls (one second apart) before giving up
        self.timeout = 3    # extra attempts allowed for a single URL

    def _fetch(self, url):
        """Download *url* into ``self.data``; return True on success.

        The same URL is retried up to ``self.timeout`` more times, one
        second apart, instead of being dropped after the first failure.
        """
        for attempt in range(self.timeout + 1):
            try:
                webpage = requests.get(url=url, headers=headers, timeout=5)
                webpage.encoding = self.encode
                self.data.append(webpage.text)
                return True
            except Exception as f:
                print(f)
                if attempt < self.timeout:
                    time.sleep(1)
        print("连接失败")
        return False

    def connect(self):
        """Drain the site's queue; stop once it stays empty for ``self.wait`` polls.

        Written as a loop so that a long queue or repeated failures cannot
        exhaust the recursion limit.
        """
        tasks = Cannel[self.id]
        idle = self.wait
        while True:
            if not tasks.empty():
                url = tasks.get()
                print("%s has running for %s" % (self.id, url))
                self._fetch(url)
                idle = self.wait    # new work arrived: restart the idle countdown
                continue
            print("%s wait for task!" % (self.id))
            if idle <= 0:
                print("%s connect compelet!" % (self.id))
                return
            idle -= 1
            time.sleep(1)

    def getres(self):
        """Collect the download links of every fetched page into ``downloadurl``."""
        for each in self.data:
            res = re.findall(self.pat, each)
            if res:
                downloadurl[self.id].extend(res)
            else:
                print("没有相关连接")

    def run(self):
        """Fetch all queued pages, extract the links and save them to a text file."""
        self.connect()
        if not self.data:
            print("%s 缺少相关信息" % self.id)
            return
        self.getres()
        print("%s has make the result!" % self.id)
        # ``with`` closes the file even if a write fails; utf-8 keeps
        # non-ASCII link text writable regardless of the system code page.
        with open(r'f://' + self.id + '.txt', 'w', encoding='utf-8') as save:
            for d in downloadurl[self.id]:
                # findall yields tuples when the pattern has several groups.
                save.write(d if isinstance(d, str) else '\t'.join(d))
                save.write('\n')
        print("%s work compelet!" % self.id)

该线程启动时需要一个字典，包含网站名、正则信息和编码信息；待爬取的url则从该网站对应的队列中获取

和上一个程序一样，最终该程序会将结果生成一个简单的txt文档，存放在f盘中

贴出修改后整体程序：

import requests
import re
import threading
import queue
import time

# Browser-like request headers sent with every request.
headers = {
    'User-Agent': r'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0;  TheWorld 7)',
}
proxies = {}  # proxy configuration (empty: none configured)

task1 = queue.Queue()
task2 = queue.Queue()
task3 = queue.Queue()
# One queue of detail-page URLs per site, keyed by site name.
Cannel = {'爱下电影': task1, '电影天堂': task2, 'BT天堂': task3}
# Download links collected per site, keyed by site name.
downloadurl = {'爱下电影': [], '电影天堂': [], 'BT天堂': []}

# Format of the site dicts stored in ``weblist``:
# {
#     'name': site name,
#     'url': search URL without the keyword (the keyword is appended later),
#     'pat': [regex 1, regex 2],
#     'root': site root address,
#     'encode': page encoding,
# }
aixiamovie = {
    'name': '爱下电影',
    'url': r'http://www.aixia.cc/plus/search.php?searchtype=titlekeyword&q=',
    'root': r'http://www.aixia.cc',
    'pat': [
        '<h1 class=".*?"><a href="(.*?)" target="_blank">',
        # Plain ASCII "onclick": the pasted source carried a Greek omicron
        # as its first letter, which can never match the page's HTML.
        'onclick="copyUrl(.*?)">',
    ],
    'encode': 'utf-8',
}
tiantang = {
    'name': '电影天堂',
    'url': r'http://s.dydytt.net/plus/so.php?kwtype=0&searchtype=title&keyword=',
    'root': r'http://s.dydytt.net',
    'pat': [
        "<td width='.*?'><b><a href='(.*?)'>",
        '<td style=".*?" bgcolor=".*?"><a href="(.*?)">',
    ],
    'encode': 'gb2312',
}
btmovie = {
    'name': 'BT天堂',
    'root': 'http://www.bttiantangs.com',
    'posturl': 'http://www.bttiantangs.com/e/search/new.php',
    'dict': {'keyboard': ''},
    'encode': 'utf-8',
    'pat': [
        '<a href="(.*?)" class="zoom" rel="bookmark" target="_blank" title="(.*?)">',
        r'<em>.*?</em></a><a href="(.*?)" target="_blank">',
    ],
}

# Sites searched with a GET request.
weblist = []
weblist.append(aixiamovie)
weblist.append(tiantang)
# Sites searched with a POST request.
postlist = []
postlist.append(btmovie)


class spdier(threading.Thread):
    """Generic detail-page crawler for one site.

    Takes detail-page URLs from ``Cannel[name]``, downloads each page,
    extracts download links with ``pat`` into ``downloadurl[name]`` and
    writes them to ``f://<name>.txt``.

    Args:
        dic: dict with the keys ``name`` (site name, used as the key into
            ``Cannel`` and ``downloadurl``), ``pat`` (regex for download
            links) and ``encode`` (page encoding).
    """

    def __init__(self, dic):
        threading.Thread.__init__(self)
        self.id = dic['name']
        self.pat = dic['pat']
        self.encode = dic['encode']
        self.data = []      # text of every detail page fetched so far
        self.wait = 5       # idle polls (one second apart) before giving up
        self.timeout = 3    # extra attempts allowed for a single URL

    def _fetch(self, url):
        """Download *url* into ``self.data``; return True on success.

        The same URL is retried up to ``self.timeout`` more times, one
        second apart, instead of being dropped after the first failure.
        """
        for attempt in range(self.timeout + 1):
            try:
                webpage = requests.get(url=url, headers=headers, timeout=5)
                webpage.encoding = self.encode
                self.data.append(webpage.text)
                return True
            except Exception as f:
                print(f)
                if attempt < self.timeout:
                    time.sleep(1)
        print("连接失败")
        return False

    def connect(self):
        """Drain the site's queue; stop once it stays empty for ``self.wait`` polls.

        Written as a loop so that a long queue or repeated failures cannot
        exhaust the recursion limit.
        """
        tasks = Cannel[self.id]
        idle = self.wait
        while True:
            if not tasks.empty():
                url = tasks.get()
                print("%s has running for %s" % (self.id, url))
                self._fetch(url)
                idle = self.wait    # new work arrived: restart the idle countdown
                continue
            print("%s wait for task!" % (self.id))
            if idle <= 0:
                print("%s connect compelet!" % (self.id))
                return
            idle -= 1
            time.sleep(1)

    def getres(self):
        """Collect the download links of every fetched page into ``downloadurl``."""
        for each in self.data:
            res = re.findall(self.pat, each)
            if res:
                downloadurl[self.id].extend(res)
            else:
                print("没有相关连接")

    def run(self):
        """Fetch all queued pages, extract the links and save them to a text file."""
        self.connect()
        if not self.data:
            print("%s 缺少相关信息" % self.id)
            return
        self.getres()
        print("%s has make the result!" % self.id)
        # ``with`` closes the file even if a write fails; utf-8 keeps
        # non-ASCII link text writable regardless of the system code page.
        with open(r'f://' + self.id + '.txt', 'w', encoding='utf-8') as save:
            for d in downloadurl[self.id]:
                # findall yields tuples when the pattern has several groups.
                save.write(d if isinstance(d, str) else '\t'.join(d))
                save.write('\n')
        print("%s work compelet!" % self.id)


class findurls(threading.Thread):
    """GET each site's search page and queue the detail-page URLs it lists.

    Args:
        website: list of site dicts with the keys ``name``, ``url`` (full
            search URL), ``pat`` (``pat[0]`` matches detail links,
            ``pat[1]`` matches download links), ``root`` and ``encode``.
    """

    def __init__(self, website):
        threading.Thread.__init__(self)
        self.website = website
        self.data = ''
        self.id = ''
        self.pat = ''
        self.root = ''
        self.encode = ''

    def connect(self, url, counts=3):
        """GET *url* and store the decoded body in ``self.data``.

        A failed request is retried up to *counts* more times, one second
        apart; when every attempt fails ``self.data`` is left unchanged.
        """
        while True:
            try:
                # A timeout keeps a dead server from blocking the thread forever.
                webpage = requests.get(headers=headers, url=url, timeout=5)
                webpage.encoding = self.encode
                self.data = webpage.text
                return
            except Exception as f:
                print(f)
                if counts <= 0:
                    print("爬取失败")
                    return
                print('%s 连接失败，即将重新连接' % url)
                time.sleep(1)
                counts -= 1

    def urlgets(self):
        """Queue every detail URL found in ``self.data`` and start a spider for the site."""
        if not self.data:
            print("没有返回数据，爬虫失败")
            return
        res = re.findall(self.pat[0], self.data)
        if not res:
            print("没有相关结果")
            return
        for item in res:
            Cannel[self.id].put(self.root + item)   # queue chosen by site name
            # NOTE(review): task_done() is meant for the consumer side; it is
            # kept here only to preserve the queue's unfinished-task count.
            Cannel[self.id].task_done()
        canshu = {'name': self.id, 'pat': self.pat[1], 'encode': self.encode}
        thread1 = spdier(dic=canshu)
        thread1.start()

    def run(self):
        """Search every configured site in turn."""
        for item in self.website:
            self.id = item['name']
            self.pat = item['pat']  # [detail-link regex, download-link regex]
            self.encode = item['encode']
            self.root = item['root']
            # Clear the previous site's page so a failed request is not
            # mistaken for a result of this site.
            self.data = ''
            self.connect(url=item['url'])
            self.urlgets()
        print("任务分配完成")


class taskstart(threading.Thread):
    """Build the search URL of every GET-based site and start ``findurls``.

    Args:
        keyword: the search text typed by the user.
    """

    def __init__(self, keyword):
        threading.Thread.__init__(self)
        self.keyword = keyword

    def run(self):
        """Percent-encode the keyword in each site's own encoding and start crawling."""
        from urllib.parse import quote

        sites = []
        for item in weblist:
            # Work on a copy so the shared base URL is not extended again on
            # a later run.  quote() also escapes ASCII characters correctly
            # and does not leave the stray quote the str(bytes) trick did.
            site = dict(item)
            site['url'] = item['url'] + quote(self.keyword, safe='', encoding=item['encode'])
            sites.append(site)
        be = findurls(website=sites)
        be.start()
class posturlget(threading.Thread):
    """POST each site's search form and queue the detail-page URLs it returns.

    Args:
        urls: list of site dicts with the keys ``name``, ``root``,
            ``posturl``, ``dict`` (form data), ``encode`` and ``pat``
            (``pat[0]`` matches detail links in the search result,
            ``pat[1]`` is handed to the spider thread).
    """

    def __init__(self, urls):
        threading.Thread.__init__(self)
        self.urls = urls
        self.encode = ''
        self.pat = []
        self.id = ''
        self.data = {}
        self.res = ''
        self.root = ''

    def postlist(self, url, counts=3):
        """POST ``self.data`` to *url* and store the decoded body in ``self.res``.

        A failed request is retried up to *counts* more times, one second
        apart; when every attempt fails ``self.res`` is left unchanged.
        """
        while True:
            try:
                # A timeout keeps a dead server from blocking the thread forever.
                webpage = requests.post(headers=headers, data=self.data, url=url, timeout=5)
                webpage.encoding = self.encode
                self.res = webpage.text
                return
            except Exception as f:
                print(f)
                if counts <= 0:
                    print('请求错误')
                    return
                counts -= 1
                time.sleep(1)

    @staticmethod
    def _detail_urls(root, matches):
        """Prefix *root* to each match; tuple matches contribute their first group."""
        return [root + (m if isinstance(m, str) else m[0]) for m in matches]

    def fenpei(self):
        """Queue every detail URL found in ``self.res`` and start a spider for the site."""
        urllist = self._detail_urls(self.root, re.findall(self.pat[0], self.res))
        if not urllist:
            print("没有匹配项")
            return
        for item in urllist:
            Cannel[self.id].put(item)
            # NOTE(review): task_done() is meant for the consumer side; it is
            # kept here only to preserve the queue's unfinished-task count.
            Cannel[self.id].task_done()
        print("分配完成")
        canshu = {'name': self.id, 'pat': self.pat[1], 'encode': self.encode}
        task = spdier(dic=canshu)
        task.start()

    def run(self):
        """Search every configured site in turn."""
        for item in self.urls:
            self.encode = item['encode']
            self.pat = item['pat']
            self.data = item['dict']
            self.id = item['name']
            self.root = item['root']
            # Clear the previous site's response so a failed request is not
            # mistaken for a result of this site.
            self.res = ''
            self.postlist(url=item['posturl'])
            if self.data and self.res:
                self.fenpei()


class posttask(threading.Thread):
    """Fill in the search form of every POST-based site, then start the crawler.

    Args:
        keyword: the search text typed by the user.
    """

    def __init__(self, keyword):
        threading.Thread.__init__(self)
        self.keyword = keyword

    def run(self):
        """Put the keyword into each site's form dict and start ``posturlget``."""
        for item in postlist:
            # Each site names its search field differently, so dispatch on the
            # site name; BT天堂 takes the raw string in 'keyboard'.
            if item['name'] == 'BT天堂':
                item['dict']['keyboard'] = self.keyword
        # Start a single crawler once every form has been prepared.
        se = posturlget(urls=postlist)
        se.start()


if __name__ == "__main__":
    # Script entry: ask for a film title and search GET and POST sites in parallel.
    keyword = input("请输入电影名称 ")
    get = taskstart(keyword=keyword)
    post = posttask(keyword=keyword)
    get.start()
    post.start()

Python爬虫获取电影链接(续)相关推荐

python爬虫获取电影天堂中电影的标题与下载地址，并用正则表达匹配电影类型
在电影天堂的列表页面,爬取每个链接的子页面中的,电影标题以及下载地址,并用正则表达式匹配出想要的电影类型源代码获取: https://github.com/akh5/Python/blob/mast ...
python爬虫获取url_Python爬虫如何获取页面内所有URL链接？本文详解
如何获取一个页面内所有URL链接?在Python中可以使用urllib对网页进行爬取,然后利用Beautiful Soup对爬取的页面进行解析,提取出所有的URL. 什么是Beautiful Soup ...
python爬虫获取下一页url_Python爬虫获取页面所有URL链接过程详解
如何获取一个页面内所有URL链接?在python中可以使用urllib对网页进行爬取,然后利用Beautiful Soup对爬取的页面进行解析,提取出所有的URL. 什么是Beautiful Soup ...
python爬虫之js链接跳转抓取_Python爬虫获取页面所有URL链接过程详解
如何获取一个页面内所有URL链接?在Python中可以使用urllib对网页进行爬取,然后利用Beautiful Soup对爬取的页面进行解析,提取出所有的URL. 什么是Beautiful Soup ...
Python爬虫获取文章的标题及你的博客的阅读量，评论量。所有数据写入本地记事本。最后输出你的总阅读量！
Python爬虫获取文章的标题及你的博客的阅读量,评论量.所有数据写入本地记事本.最后输出你的总阅读量!还可以进行筛选输出!比如阅读量大于1000,之类的! 完整代码在最后.依据阅读数量进行降序输出! ...
突破次元壁障，Python爬虫获取二次元女友
突破次元壁障,Python爬虫获取二次元女友前言程序说明二次元女友获取程序观察网页结构页面解析创建图片保存路径图片下载格式转换爬取结果展示完整程序前言 (又到了常见的无中生友环节 ...
Python爬虫 | Python爬虫获取女友图片
Python爬虫 | Python爬虫获取女友图片前言程序说明二次元女友获取程序观察网页结构页面解析创建图片保存路径图片下载格式转换爬取结果展示完整程序前言 (又到了常见的无中生 ...
python爬虫获取天猫店铺信息（更新到2020年）
python爬虫获取天猫店铺信息爬取需求在天猫搜索一个关键词,然后抓取这个关键词下的相关店铺,由于taobao的反爬策略,只能爬取到第十页大概200个店铺的信息. 效果预览最终爬取的数据用exc ...
python爬虫获取起点中文网人气排行Top100(快速入门,新手必备!)
本篇博客小菌为大家带来的是用python爬虫获取起点中文网人气排行Top100的分享,希望大家能在学习的过程中感受爬虫的魅力! 我们先根据网址https://www.qidian.co ...
python爬虫—豆瓣电影海报（按类别）
原文地址:http://www.alannah.cn/2019/04/06/getdouban/ python爬虫-豆瓣电影海报目标:通过python爬虫在豆瓣电影上按类别对电影海报等数据进行抓取, ...

Python爬虫获取电影链接(续)

Python爬虫获取电影链接(续)相关推荐

最新文章

热门文章