完整代码

import requests
from lxml import etree

# Scheme and host of the site; hrefs scraped from list pages are joined onto it.
BASE_DOMAIN = 'http://www.ygdy8.net'
# Example list-page URL: 'http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html'

# Headers passed to every requests.get() call in this script.
# NOTE(review): presumably sent so the request looks like a normal browser visit — confirm.
HEADERS = {
    'Referer': 'http://www.ygdy8.net/html/gndy/dyzz/index.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}


# Get the URLs of the detail pages
def get_detail_urls(url):
    """Return the absolute URLs of the detail pages linked from one list page.

    Args:
        url: Address of a list page.

    Returns:
        list of str: One absolute URL for every ``href`` found on an ``<a>``
        inside ``<table class="tbspan">``.
    """
    # Imported locally so the function does not rely on the file's import block.
    from urllib.parse import urljoin

    # The timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # urljoin copes with site-relative and already-absolute hrefs alike,
    # which plain string concatenation does not. A list (rather than a lazy
    # map object) lets callers iterate the result more than once.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]


# Extract the data from a detail page
def parse_detail_page(url):
    """Download one movie detail page and extract its fields into a dict.

    Args:
        url: Absolute address of the detail page.

    Returns:
        dict: Only the fields that were found on the page. Possible keys are
        ``title``, ``cover``, ``year``, ``country``, ``category``,
        ``language``, ``grade``, ``director`` (all str), ``actors`` (list of
        str) and ``profile`` (str, synopsis lines joined with newlines).
        A page without a title, cover image or ``Zoom`` block yields a dict
        without those keys instead of raising ``IndexError``.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The page is decoded as GBK; errors='replace' keeps a stray invalid byte
    # from aborting the whole crawl with UnicodeDecodeError.
    text = response.content.decode('gbk', errors='replace')
    html = etree.HTML(text)

    movie = {}
    if html is None:
        return movie

    titles = html.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')
    if titles:
        movie['title'] = titles[0]

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if not zoom_nodes:
        return movie
    zoom = zoom_nodes[0]

    covers = zoom.xpath(".//img/@src")
    if covers:
        movie['cover'] = covers[0]

    movie.update(_parse_info_lines(zoom.xpath('.//text()')))
    return movie


# Label prefix -> result key for the fields whose value sits on the label's own line.
_SIMPLE_FIELDS = (
    ('◎年　　代', 'year'),
    ('◎产　　地', 'country'),
    ('◎类　　别', 'category'),
    ('◎语　　言', 'language'),
    ('◎豆瓣评分', 'grade'),
    ('◎导　　演', 'director'),
)
_ACTORS_LABEL = '◎主　　演'
_PROFILE_LABEL = '◎简　　介'
# A line starting with one of these ends a multi-line field. Every field label
# in this file starts with '◎'.
# NOTE(review): '【' is assumed to open the download-section heading that
# follows the synopsis — verify against live pages.
_SECTION_PREFIXES = ('◎', '【')


def _following_lines(lines, start):
    """Collect the non-empty lines from ``start`` up to the next section marker."""
    collected = []
    for line in lines[start:]:
        if line.startswith(_SECTION_PREFIXES):
            break
        if line:
            collected.append(line)
    return collected


def _parse_info_lines(infos):
    """Turn the text nodes of the ``Zoom`` block into a dict of movie fields."""
    # strip() also removes the full-width spaces used for alignment.
    lines = [info.strip() for info in infos]
    fields = {}
    for index, line in enumerate(lines):
        if line.startswith(_ACTORS_LABEL):
            # The first actor shares the label's line; the rest follow on
            # their own lines until the next section begins.
            first = line[len(_ACTORS_LABEL):].strip()
            fields['actors'] = ([first] if first else []) + _following_lines(lines, index + 1)
        elif line.startswith(_PROFILE_LABEL):
            # Keep every synopsis line up to the next section, rather than
            # letting each later text node overwrite the previous one.
            first = line[len(_PROFILE_LABEL):].strip()
            parts = ([first] if first else []) + _following_lines(lines, index + 1)
            fields['profile'] = '\n'.join(parts)
        else:
            for label, key in _SIMPLE_FIELDS:
                if line.startswith(label):
                    fields[key] = line[len(label):].strip()
                    break
    return fields


# Build the URL of every list page
def spider(page_count=1):
    """Crawl the list pages and return the parsed data of every linked movie.

    Args:
        page_count: Number of list pages to crawl, starting at page 1.
            Defaults to 1, which matches the previously hard-coded
            ``range(1, 2)``.

    Returns:
        list of dict: One dict per detail page, as returned by
        ``parse_detail_page``.
    """
    base_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # Outer loop: one iteration per list page.
    for page in range(1, page_count + 1):
        list_url = base_url.format(page)
        # Inner loop: visit every detail page linked from this list page.
        for detail_url in get_detail_urls(list_url):
            movies.append(parse_detail_page(detail_url))
    return movies


if __name__ == '__main__':
    # Crawl once and reuse the result; a second spider() call would download
    # every page again.
    for movie in spider():
        for key, value in movie.items():
            print("{!s}:{!s}".format(key, value))

网页分析

我们要爬取电影天堂最新上传的电影和每一部电影的详情页面
先分析一下页面

我们用浏览器的"检查"功能查看页面，找到电影标题所在的 a 标签及其 href 属性

然后我们点进去发现，href 里的地址和浏览器地址栏中看到的完整 url 不同：它只是基础域名之后的那一段路径，所以我们在爬取详情页面的时候要在前面加上基础域名。

结合xpath语法我们可以提取出来每一个详情页面的url
代码如下

# Scheme and host of the site; hrefs scraped from list pages are joined onto it.
BASE_DOMAIN = 'http://www.ygdy8.net'
# Example list-page URL: 'http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html'

# Headers passed to every requests.get() call in this script.
# NOTE(review): presumably sent so the request looks like a normal browser visit — confirm.
HEADERS = {
    'Referer': 'http://www.ygdy8.net/html/gndy/dyzz/index.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}


# Get the URLs of the detail pages
def get_detail_urls(url):
    """Return the absolute URLs of the detail pages linked from one list page.

    Args:
        url: Address of a list page.

    Returns:
        list of str: One absolute URL for every ``href`` found on an ``<a>``
        inside ``<table class="tbspan">``.
    """
    # Imported locally so the function does not rely on the file's import block.
    from urllib.parse import urljoin

    # The timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # urljoin copes with site-relative and already-absolute hrefs alike,
    # which plain string concatenation does not. A list (rather than a lazy
    # map object) lets callers iterate the result more than once.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]

因为我们要获取多页的url，所以我们定义了一个函数来生成每一页列表页的url，并依次爬取

代码如下

def spider(page_count=1):
    """Crawl the list pages and return the parsed data of every linked movie.

    Args:
        page_count: Number of list pages to crawl, starting at page 1.
            Defaults to 1, which matches the previously hard-coded
            ``range(1, 2)``.

    Returns:
        list of dict: One dict per detail page, as returned by
        ``parse_detail_page``.
    """
    base_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # Outer loop: one iteration per list page.
    for page in range(1, page_count + 1):
        list_url = base_url.format(page)
        # Inner loop: visit every detail page linked from this list page.
        for detail_url in get_detail_urls(list_url):
            movies.append(parse_detail_page(detail_url))
    return movies

打印一下 OK ，没问题我们就开始提取每一页详情页面的数据。

提取数据

# Extract the data from a detail page
def parse_detail_page(url):
    """Download one movie detail page and extract its fields into a dict.

    Args:
        url: Absolute address of the detail page.

    Returns:
        dict: Only the fields that were found on the page. Possible keys are
        ``title``, ``cover``, ``year``, ``country``, ``category``,
        ``language``, ``grade``, ``director`` (all str), ``actors`` (list of
        str) and ``profile`` (str, synopsis lines joined with newlines).
        A page without a title, cover image or ``Zoom`` block yields a dict
        without those keys instead of raising ``IndexError``.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The page is decoded as GBK; errors='replace' keeps a stray invalid byte
    # from aborting the whole crawl with UnicodeDecodeError.
    text = response.content.decode('gbk', errors='replace')
    html = etree.HTML(text)

    movie = {}
    if html is None:
        return movie

    titles = html.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')
    if titles:
        movie['title'] = titles[0]

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if not zoom_nodes:
        return movie
    zoom = zoom_nodes[0]

    covers = zoom.xpath(".//img/@src")
    if covers:
        movie['cover'] = covers[0]

    movie.update(_parse_info_lines(zoom.xpath('.//text()')))
    return movie


# Label prefix -> result key for the fields whose value sits on the label's own line.
_SIMPLE_FIELDS = (
    ('◎年　　代', 'year'),
    ('◎产　　地', 'country'),
    ('◎类　　别', 'category'),
    ('◎语　　言', 'language'),
    ('◎豆瓣评分', 'grade'),
    ('◎导　　演', 'director'),
)
_ACTORS_LABEL = '◎主　　演'
_PROFILE_LABEL = '◎简　　介'
# A line starting with one of these ends a multi-line field. Every field label
# in this file starts with '◎'.
# NOTE(review): '【' is assumed to open the download-section heading that
# follows the synopsis — verify against live pages.
_SECTION_PREFIXES = ('◎', '【')


def _following_lines(lines, start):
    """Collect the non-empty lines from ``start`` up to the next section marker."""
    collected = []
    for line in lines[start:]:
        if line.startswith(_SECTION_PREFIXES):
            break
        if line:
            collected.append(line)
    return collected


def _parse_info_lines(infos):
    """Turn the text nodes of the ``Zoom`` block into a dict of movie fields."""
    # strip() also removes the full-width spaces used for alignment.
    lines = [info.strip() for info in infos]
    fields = {}
    for index, line in enumerate(lines):
        if line.startswith(_ACTORS_LABEL):
            # The first actor shares the label's line; the rest follow on
            # their own lines until the next section begins.
            first = line[len(_ACTORS_LABEL):].strip()
            fields['actors'] = ([first] if first else []) + _following_lines(lines, index + 1)
        elif line.startswith(_PROFILE_LABEL):
            # Keep every synopsis line up to the next section, rather than
            # letting each later text node overwrite the previous one.
            first = line[len(_PROFILE_LABEL):].strip()
            parts = ([first] if first else []) + _following_lines(lines, index + 1)
            fields['profile'] = '\n'.join(parts)
        else:
            for label, key in _SIMPLE_FIELDS:
                if line.startswith(label):
                    fields[key] = line[len(label):].strip()
                    break
    return fields

提取数据遇到的问题

提取年代、语言这些数据的时候，我们不需要前面的标签文字（例如"◎年　　代"），所以先用startswith判断这一行属于哪个字段，然后把前面的标签文字去掉（例如用replace），最后用strip()去除前后空格。
在获取主演时我们发现主演不止一行：第一位主演和"◎主　　演"在同一行，其余每人各占一行，所以我们要用for循环把后面的各行依次加入列表，直到遇到"◎简　　介"为止。
最后返回到一个列表里打印出来

效果如下

使用requests爬取电影天堂数据，用lxml和Xpath相关推荐

python下载电影天堂_Python爬虫初学：爬取电影天堂数据
本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 以下文章来源于IT共享之家,作者:IT共享者 [一.项目背景] 相信大家都有一种头疼的体验,要下载 ...
requests+xpath爬取电影天堂电影信息
电影天地网址:http://www.ygdy8.net/html/gndy/china/list_4_1.html 目标: 1.爬取电影天堂的国内电影一栏的所有电影的url 2.进入每个电影的url获 ...
爬取电影天堂最新电影（xpath结合lxml）
完整代码 import requests from lxml import etree from openpyxl import WorkbookBASEURL='https://www.dytt8. ...
Python，爬取电影天堂，你觉得怎么样？
一.爬虫的重要性: 如果把互联网比喻成一个蜘蛛网,那么Spider就是在网上爬来爬去的蜘蛛.网络蜘蛛通过网页的链接地址来寻找网页,从网站某一个页面(通常是首页)开始,读取网页的内容,找到在网页中的其它 ...
爬虫学习（一）---爬取电影天堂下载链接
欢迎加入python学习交流群 667279387 爬虫学习爬虫学习(一)-爬取电影天堂下载链接爬虫学习(二)–爬取360应用市场app信息主要利用了python3.5 requests,Bea ...
python爬电影天堂_python爬虫爬取电影天堂电影
python爬虫爬取电影天堂电影?本项目实现一个简单的爬虫,通过requests和BeautifulSoup爬取电影天堂电影信息,包括片名.年代.产地.类别.语言.海报链接和视频链接等内容.pytho ...
python 爬取_Python爬取电影天堂
前言: 本文非常浅显易懂,可以说是零基础也可快速掌握.如有疑问,欢迎留言,笔者会第一时间回复.本文代码存于github 一.爬虫的重要性: 如果把互联网比喻成一个蜘蛛网,那么Spider就是在网上爬来 ...
scrapy初步-简单静态爬虫(爬取电影天堂所有电影)
之前用java写过一个简单的爬取电影天堂信息的爬虫,后来发现用python写这种简单的爬虫程序更简单,异步网络框架在不使用多线程和多进程的情况下也能增加爬取的速度,目前刚开始学scrapy,用这个写了 ...
python爬取电影天堂的下载链接
python爬取电影天堂dytt8的下载链接电影天堂下载链接都是magnet的,搞下来想下就下没有广告建一个main.py 一个一个挨着去爬肯定慢啊,建一个多线程的去爬 mui.py 多线程有可能 ...
python爬电影_使用Python多线程爬虫爬取电影天堂资源
最近花些时间学习了一下Python,并写了一个多线程的爬虫程序来获取电影天堂上资源的迅雷下载地址,代码已经上传到GitHub上了,需要的同学可以自行下载.刚开始学习python希望可以获得宝贵的意见. ...

使用requests爬取电影天堂数据，用lxml和Xpath

完整代码

网页分析

提取数据

效果如下

使用requests爬取电影天堂数据，用lxml和Xpath相关推荐

最新文章

热门文章