Python爬取第一电影天堂最新电影（5000多部）代码实例（一）

1、实例代码：

#!/usr/bin/env python
#-*- coding: utf-8 -*-
#@Time    : 2020/4/7 16:28
#@Author  : zhangliangliang
#@File    : crawlerDemo3.py
#@Software: PyCharm
import datetime
import random
import threading
from urllib import request

import requests
from bs4 import BeautifulSoup
from lxml import etree

BASE_URL = "http://www.dytt8.net"

# Default locations of the one-entry-per-line configuration files.
USER_AGENT_FILE = "/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/user_agent.txt"
IP_FILE = '/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/ip.txt'

# Seconds to wait for a list/detail page before giving up on the request.
REQUEST_TIMEOUT = 10

# Label printed on the detail page -> key stored in the movie dict.
# The labels contain full-width (U+3000) spaces; they must match the page text.
_SIMPLE_FIELDS = (
    ("◎年　　代", "year"),
    ("◎产　　地", "country"),
    ("◎类　　别", "category"),
    ("◎语　　言", "language"),
    ("◎字　　幕", "sub_title"),
    ("◎上映日期", "release_time"),
    ("◎豆瓣评分", "douban_score"),
    ("◎片　　长", "length"),
    ("◎导　　演", "director"),
)
_ACTORS_LABEL = "◎主　　演"
_PROFILE_LABEL = "◎简　　介"
_DOWNLOAD_MARKER = "【下载地址】"


def readFile(path):
    """Return the lines of the text file at *path*, trailing whitespace removed."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip() for line in f]


def writeFile(path, text):
    """Append *text* followed by a newline to the file at *path*."""
    # utf-8 is used explicitly so the output matches truncateFile() and does
    # not depend on the platform's default encoding.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')


def truncateFile(path):
    """Create the file at *path* if needed and empty it."""
    # Opening in 'w' mode already truncates; no explicit truncate() is needed.
    with open(path, 'w', encoding='utf-8'):
        pass


def _random_line(path):
    """Return one random non-blank line from the file at *path*."""
    candidates = [line for line in readFile(path) if line]
    if not candidates:
        raise ValueError("no usable entries in {}".format(path))
    return random.choice(candidates)


def getHeaders(path=USER_AGENT_FILE):
    """Build request headers with a User-Agent picked at random from *path*."""
    headers = {'User-Agent': _random_line(path)}
    print(headers)
    return headers


def getIp(path=IP_FILE):
    """Return a random ``host:port`` proxy entry read from *path*."""
    ip = _random_line(path)
    print(ip)
    return ip


def checkip(targeturl, ip):
    """Return True if *targeturl* answers with HTTP 200 through proxy *ip*."""
    headers = getHeaders()
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    try:
        response = requests.get(url=targeturl, proxies=proxies,
                                headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        # Dead, slow or malformed proxy: treat it as unusable.
        return False
    return response.status_code == 200


def getProxies(url):
    """Return a ``proxies`` dict for a working random proxy, or None.

    None makes ``requests`` fall back to a direct connection.
    """
    ip = getIp()
    if checkip(url, ip):
        proxies = {'http': 'http://' + ip}
        print(proxies)
        return proxies
    return None


def _parse_html(markup):
    """Parse *markup* with lxml; return the root element or None on failure."""
    try:
        return etree.HTML(markup)
    except (etree.LxmlError, ValueError):
        return None


def get_detail_url(url):
    """Return the absolute detail-page URLs linked from the list page *url*.

    An empty list is returned when the page cannot be fetched or parsed, so
    callers can always iterate over the result.
    """
    proxies = getProxies(url)
    header = getHeaders()
    try:
        response = requests.get(url, headers=header, proxies=proxies,
                                timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as exc:
        print("Request failed for {}: {}".format(url, exc))
        return []
    html = _parse_html(response.text)
    if html is None:
        return []
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_URL + href for href in hrefs]


def _strip_label(info, label):
    """Remove *label* from *info* and trim surrounding whitespace."""
    return info.replace(label, "").strip()


def _collect_until(lines, start, stop_prefix):
    """Return stripped lines from index *start* up to the first line that
    starts with *stop_prefix* (exclusive)."""
    collected = []
    for raw in lines[start:]:
        line = raw.strip()
        if line.startswith(stop_prefix):
            break
        collected.append(line)
    return collected


def _extract_movie_fields(infos):
    """Turn the text nodes of the ``Zoom`` block into a dict of movie fields.

    *infos* is the list of text nodes in document order. Each labelled line
    ("◎...") becomes one entry; actors and the synopsis span the following
    lines as well.
    """
    fields = {}
    for index, raw in enumerate(infos):
        info = raw.strip()
        if not info.startswith("◎"):
            continue
        if info.startswith(_ACTORS_LABEL):
            # First actor shares the label's line; the rest follow one per
            # line until the next "◎" label.
            names = [_strip_label(info, _ACTORS_LABEL)]
            names.extend(_collect_until(infos, index + 1, "◎"))
            fields['actors'] = [name for name in names if name]
        elif info.startswith(_PROFILE_LABEL):
            # The synopsis text is on the lines after the label; keep the
            # first non-empty one (empty string if there is none) instead of
            # failing with IndexError.
            paragraphs = _collect_until(infos, index + 1, _DOWNLOAD_MARKER)
            fields['profiles'] = next((p for p in paragraphs if p), "")
        else:
            for label, key in _SIMPLE_FIELDS:
                if info.startswith(label):
                    fields[key] = _strip_label(info, label)
                    break
    return fields


def parse_detail_page(url):
    """Fetch the detail page *url* and return its data as a dict.

    Possible keys: title, year, country, category, language, sub_title,
    release_time, douban_score, length, director, actors, profiles,
    download_url. Keys whose data is missing on the page are omitted; an
    empty dict is returned when the page cannot be fetched or parsed.
    """
    proxies = getProxies(url)
    header = getHeaders()
    movie = {}
    try:
        response = requests.get(url, headers=header, proxies=proxies,
                                timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as exc:
        print("Request failed for {}: {}".format(url, exc))
        return movie

    # Decode before parsing so an undecodable character is replaced rather
    # than possibly making the parser drop the rest of the document.
    # NOTE(review): assumes the site serves GBK-family text (gb18030 is a
    # superset of GBK/GB2312) - confirm against the live pages.
    html = _parse_html(response.content.decode('gb18030', errors='replace'))
    if html is None:
        print("这条数据处理失败")
        return movie

    # Title and detail block are looked up independently: a missing title
    # must not discard the rest of the movie's data.
    titles = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()")
    if titles and titles[0]:
        movie['title'] = titles[0]

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if zoom_nodes:
        movie.update(_extract_movie_fields(zoom_nodes[0].xpath(".//text()")))
    else:
        print("这条数据处理失败")

    movie['download_url'] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    return movie


def spider(path, start_page=138, end_page=213):
    """Crawl list pages ``start_page`` .. ``end_page - 1`` and return all movies.

    Every parsed movie dict is also appended, as one ``str(dict)`` line, to
    the file at *path*.
    """
    base_url = BASE_URL + '/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(start_page, end_page):
        url = base_url.format(page)
        print(url)
        for detail_url in get_detail_url(url):
            print(detail_url)
            movie = parse_detail_page(detail_url)
            writeFile(path, str(movie))
            movies.append(movie)
            print(movie)
    return movies


if __name__ == '__main__':
    path = "/Users/zll/Desktop/movies.txt"
    truncateFile(path)
    spider(path)

代理IP：淘宝上一块钱买了10000个，还能用
列举几个：

60.217.152.231:8060
119.179.143.126:8060
39.106.205.147:8085
111.231.239.143:1081
110.243.24.193:9999
110.243.10.153:9999
110.243.7.228:9999
39.137.69.7:80
121.232.148.106:9000

User-Agent: 上网百度了一下
列举几个：

Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML like Gecko) Chrome/20.0.1132.57 Safari/536.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1092.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1090.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Chrome/19.77.34.5 Safari/537.1
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.9 Safari/536.5
Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.36 Safari/536.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1061.0 Safari/536.3
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML like Gecko) Chrome/19.0.1055.1 Safari/535.24
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML like Gecko) Chrome/19.0.1055.1 Safari/535.24

所遇到的问题总结：
一、
首先要知道自己浏览器的User-Agent：
可以使服务器识别客户使用的操作系统及版本、浏览器及版本等信息。在做爬虫时加上此信息，可以伪装为浏览器；如果不加，很可能会被识别出为爬虫。

怎样获取？
在浏览器的地址栏内输入 about:version，页面中的“用户代理”一项即为 User-Agent

二、

写入文件遇到的问题
问题一：
报错：
TypeError: write() takes no keyword arguments
原因：
write() 不接受关键字参数：它只接受一个位置参数，即要写入文件的内容。如果再传入其他参数（例如 encoding=...），就会抛出该 TypeError；编码等选项应在 open() 中指定。
改正：
write()里只留要写入的内容，不配置其他参数

问题二：
报错：
TypeError: write() argument must be str, not dict
原因：
以文本模式打开的文件，write() 只接受 str 类型的参数，不能直接写入 dict
改正：
先把要写入的数据转换为字符串再写入，例如 f.write(str(movie))

{'title': '2019年获奖剧情《未成年》BD韩语中字', 'year': '2019', 'country': '韩国', 'category': '剧情', 'language': '韩语', 'sub_title': '中文', 'release_time': '2019-04-11(韩国)', 'douban_score': '7.1/10 from 1591 users', 'length': '96分钟', 'director': '金允石 Yoon-suk Kim', 'actors': ['廉晶雅 Jung-ah Yum', '金素真 So-Jin Kim', '金惠'], 'download_url': []}
原因：没有找到问题，暂时没有解决，造成的问题是一些电影的数据丢失

IndexError: list index out of range
title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]

UnboundLocalError: local variable 'Zoome' referenced before assignment

AttributeError: 'dict' object has no attribute 'xpath'

以上三个问题我是加了try except，不报错了，但还是有点问题
欢迎各位大佬批评指正，优化代码

爬取的数据在上传的资源里，后期用于Python做数据分析加前端展示
.
.
.
.
.
下面是我的公众号，收集了现在主流的大数据技能和架构，欢迎大家一起来学习交流。

Python爬取第一电影天堂最新电影（5000多部）代码实例（一）相关推荐

爬取电影天堂最新电影（xpath结合lxml）
完整代码 import requests from lxml import etree from openpyxl import WorkbookBASEURL='https://www.dytt8. ...
使用Python爬取不同类别的豆瓣电影简介
使用Python爬取不同类别的豆瓣电影简介之前做过一点文本分类的工作,从豆瓣上爬取了不同类别的数千条电影的简介. 爬取目标我们爬取的目标是豆瓣影视,打开豆瓣网,随便点击一部电影,即可看到电影的介 ...
python实战（一）Python爬取猫眼评分排行前100电影及简单数据分析可视化python实战（一）Python爬取猫眼排行前一百电影及简单数据分析可视化
python实战(一)Python爬取猫眼排行前一百电影及简单数据分析可视化一.抓取数据需要的库 request库响应http请求 json库将文本保存成json形式 pyquery 类似JQ ...
Python爬取豆瓣正在上映的电影
Python爬取豆瓣正在上映的电影 #爬取豆瓣正在上映的电影 import requests from lxml import etree #1.将目标从网站上的页面抓取下来 headers = {' ...
Python 利用requests+BeautifulSoup4编写原生爬虫，爬取电影天堂最新电影，并打造最新电影下载及查询器
可能有许多人有这样的一种烦恼,当想要查看最近更新的电影时,不得不打开电影天堂的官网进行查询(当然如果你习惯用电影天堂下载电影的话/微笑),当点击了解电影详情的时候,网页往往就切换到了广告页面,很烦有没 ...
爬取电影天堂最新电影的名称和下载链接
此次的目标是爬取电影天堂最新200页的最新电影的电影名称和下载链接,电影的下载链接在二级页面,所以需要先匹配一级页面的所有链接,然后逐个请求二级页面,代码如下: """爬 ...
【宅男宅女们的福音】电影天堂最新电影爬取及搜索脚本
多线程电影天堂最新资源爬取脚本.电影搜索脚本 PS:方便大家使用写到了HTML中生成表格. 线程可以在脚本里直接改,测试线程为30时IP可能会被限制访问.[阳光电影是电影天堂的马甲] 环境: Pyth ...
python爬取猫眼正在热映电影
用python写爬虫爬取需要的数据比较容易,以Python简洁的语法和一大波成熟的库,写起来相当的快 python的版本以及使用的库 Python 3.6.4 requests lxml 这次主要是爬 ...
SpringBoot集成jsoup多线程爬取美剧天堂全部电影资源
SpringBoot集成jsoup爬取美剧天堂全部美剧资源准备工作这次我的目的是获取这个网站的所有美剧的信息和迅雷的BT地址,我们需要获取的信息都在上万个类似于下面个页面结构的页面上确定了目标, ...
python 爬取B站原视频的实站代码
本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理以下文章来源于腾讯云,作者:python学习教程 ( 想要学习Python?Pyt ...

Python爬取第一电影天堂最新电影（5000多部）代码实例（一）

Python爬取第一电影天堂最新电影（5000多部）代码实例（一）相关推荐

最新文章

热门文章