使用python爬取电影下载地址并使用transmissionrpc下载

说明

Python 练手项目：爬取电影天堂的最新电影，获取磁力链接并输出到日志文件，再使用 transmissionrpc 下载。
涉及知识点：
1、Python 操作 MongoDB，参考文档
2、BeautifulSoup解析html文档，参考官方开发文档

爬取电影磁力地址

配置文件settings.py

# Database connection settings.
db_config = {
    "host": "localhost",
    "port": 27017,
    "db_name": "spider",
}

# Mapping from logical names used in the code to MongoDB collection names.
db_collections = {
    # Collection holding the scraped movie documents.
    "movies": "movies",
    # Collection holding the crawl queue (URL management).
    "urlManager": "urlManager",
}

使用了 MongoDB 存储数据，db_utils.py

import conf.settings as settings
import pymongo# 返回项目数据库
# Shared client, created lazily on the first _get_db() call and reused afterwards
# so that every helper call does not open a fresh connection pool.
_client = None


def _get_db():
    """Return the project's MongoDB database handle.

    The connection URL is built from ``settings.db_config["host"]`` and
    ``settings.db_config["port"]``; the database is selected by
    ``settings.db_config["db_name"]``.
    """
    global _client
    if _client is None:
        url = "mongodb://" + settings.db_config["host"] + ":" + str(settings.db_config["port"])
        _client = pymongo.MongoClient(url)
    return _client[settings.db_config["db_name"]]


# Insert a single document.
def insert_one(col_name, entry):
    """Insert one document into the collection named *col_name*.

    Args:
        col_name: Name of the target collection.
        entry: The document (a dict) to insert.

    Returns:
        Whatever the driver's ``insert_one`` returns, so callers can inspect
        the outcome; callers that ignore the return value are unaffected.
    """
    collection = _get_db()[col_name]
    return collection.insert_one(entry)


# Insert many documents.
def insert_list(col_name, entry_list):
    """Insert every document of *entry_list* into the collection *col_name*.

    Args:
        col_name: Name of the target collection.
        entry_list: List of documents (dicts) to insert.

    Returns:
        Whatever the driver's ``insert_many`` returns, or ``None`` when
        *entry_list* is empty (nothing is sent to the database in that case).
    """
    # pymongo rejects an empty document list, so treat "nothing to insert"
    # as a no-op instead of letting the call raise.
    if not entry_list:
        return None
    collection = _get_db()[col_name]
    return collection.insert_many(entry_list)


# Find a single document.
def find_one(col_name, param):
    """Return one document of *col_name* matching the filter *param*.

    Returns whatever the driver's ``find_one`` returns; the callers in this
    project treat ``None`` as "no match".
    """
    collection = _get_db()[col_name]
    return collection.find_one(param)


def find(col_name, param):
    """Return the driver's ``find`` result for the filter *param* on *col_name*."""
    collection = _get_db()[col_name]
    return collection.find(param)


# Update a single document.
def update_one(col_name, param, update):
    """Apply *update* to one document of *col_name* matching *param*.

    Args:
        col_name: Name of the target collection.
        param: Filter selecting the document.
        update: Update specification, e.g. ``{"$set": {...}}``.

    Returns:
        Whatever the driver's ``update_one`` returns; callers that ignore
        the return value are unaffected.
    """
    collection = _get_db()[col_name]
    return collection.update_one(param, update)


# Update many documents.
def update_many(col_name, param, update):
    """Apply *update* to every document of *col_name* matching *param*.

    Args:
        col_name: Name of the target collection.
        param: Filter selecting the documents.
        update: Update specification, e.g. ``{"$set": {...}}``.

    Returns:
        Whatever the driver's ``update_many`` returns; callers that ignore
        the return value are unaffected.
    """
    collection = _get_db()[col_name]
    return collection.update_many(param, update)

使用数据库制作简单网址爬取链，去重并获取下一个执行网址，url_manager.py

import conf.settings as settings
import libs.db_utils as db_utils


class UrlManager:
    """Crawl queue stored in a MongoDB collection, de-duplicated by URL.

    Each queue document has the form ``{"url": <url>, "isCrawl": <bool>}``.
    The collection name comes from ``settings.db_collections["urlManager"]``.
    """

    def __init__(self):
        self.col_name = settings.db_collections["urlManager"]

    def add_url(self, url):
        """Queue *url* for crawling unless a document for it already exists."""
        url_doc = db_utils.find_one(self.col_name, {"url": url})
        if url_doc is None:
            doc = {"url": url, "isCrawl": False}
            db_utils.insert_one(self.col_name, doc)

    def has_crawl(self, url):
        """Return True when *url* is stored with ``isCrawl`` set to True."""
        url_doc = db_utils.find_one(self.col_name, {"url": url, "isCrawl": True})
        return url_doc is not None

    def remove_url(self, url):
        """Mark *url* as crawled.

        Despite the name, the document is not deleted: its ``isCrawl`` flag
        is set to True so that the URL is never queued or returned again.
        """
        param = {"url": url, "isCrawl": False}
        db_utils.update_one(self.col_name, param, {"$set": {"isCrawl": True}})

    def get_next_url(self):
        """Return one queue document whose ``isCrawl`` is False, or None.

        The document is whatever ``db_utils.find_one`` yields for that
        filter; no randomisation is applied here.
        """
        return db_utils.find_one(self.col_name, {"isCrawl": False})

日志工具类，log_utils.py

import logging
import os


class Logger:
    """Configure the root logger to write INFO-and-above records to a file and the console.

    The configured logger is exposed as ``self.logger``. Creating several
    ``Logger`` objects is safe: handlers are named, and a handler that is
    already attached to the root logger is not attached a second time, so
    records are not duplicated.
    """

    _FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"
    _CONSOLE_HANDLER_NAME = "Logger.console"
    _FILE_HANDLER_PREFIX = "Logger.file:"

    def __init__(self, file_path):
        """Attach a file handler for *file_path* and a console handler.

        Args:
            file_path: Path of the log file; it is opened in append mode and
                its parent directory is created when missing.
        """
        # Step 1: take the root logger and set its overall threshold.
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        formatter = logging.Formatter(self._FORMAT)
        attached = {handler.name for handler in self.logger.handlers}

        # Step 2: file handler, keyed by absolute path so the same file is
        # never attached twice even when spelled differently.
        absolute_path = os.path.abspath(file_path)
        file_handler_name = self._FILE_HANDLER_PREFIX + absolute_path
        if file_handler_name not in attached:
            os.makedirs(os.path.dirname(absolute_path), exist_ok=True)
            # Explicit UTF-8: the log messages contain Chinese text, which a
            # locale-dependent default encoding may be unable to encode.
            fh = logging.FileHandler(file_path, mode='a', encoding='utf-8')
            fh.set_name(file_handler_name)
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

        # Step 3: console handler, attached at most once.
        if self._CONSOLE_HANDLER_NAME not in attached:
            ch = logging.StreamHandler()
            ch.set_name(self._CONSOLE_HANDLER_NAME)
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

以下为爬取核心代码。关于使用 BeautifulSoup 获取下载地址链接，
可以参考官方开发文档。

import libs.db_utils as db_utils
import conf.settings as settings
from libs.log_utils import Logger
from libs.url_manager import UrlManager
import requests
import re
from bs4 import BeautifulSoup


class Movies:
    """Crawl the dytt8.net movie listing pages and store each movie in MongoDB.

    Listing pages are taken from the ``UrlManager`` queue. For every movie
    table on a listing page the detail page is fetched, its ftp and magnet
    links are extracted and logged, and the movie is inserted into (or
    updated in) the ``movies`` collection, keyed by name.

    NOTE(review): ``BeautifulSoup`` is called without naming a parser, so bs4
    picks whichever parser is installed; consider pinning one.
    """

    # Seconds to wait for the site before a request is abandoned. Without a
    # timeout a stalled connection would block the crawl loop forever.
    request_timeout = 30

    def __init__(self):
        self.headers = {
            "Host": "www.dytt8.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.132 Safari/537.36",
            "DNT": "1",
            "Referer": "http://dytt8.net/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,ja;q=0.6",
        }
        # Prefix for detail-page links found in a movie table.
        self.domain = "https://www.dytt8.net"
        # Prefix for pagination links found on a listing page.
        self.url_domain = "https://www.dytt8.net/html/gndy/dyzz/"
        self.url_manager = UrlManager()
        # Seed the crawl queue with the first listing page.
        self.url_manager.add_url("https://www.dytt8.net/html/gndy/dyzz/index.html")
        self.download_url1_reg = re.compile(r'ftp://\S+')
        self.download_url2_reg = re.compile(r'magnet:\S+')
        self.logger = Logger("../log/movie_service.log").logger

    def spider_movies(self):
        """Crawl queued listing pages one by one until the queue is empty."""
        while True:
            url_doc = self.url_manager.get_next_url()
            if url_doc is None:
                return
            self.single_spider(url_doc["url"])

    def single_spider(self, url):
        """Crawl one listing page.

        Queues every pagination link found on the page, stores every movie
        that could be parsed, then marks *url* as crawled.

        Args:
            url: Address of the listing page.
        """
        response = requests.get(url=url, headers=self.headers, timeout=self.request_timeout)
        response.encoding = "GBK"
        soup = BeautifulSoup(response.text)
        tables = soup.select(".co_content8 table")

        for a in soup.select(".x td a"):
            href = a.get("href")
            # An anchor without href cannot be followed; skip it instead of
            # aborting the whole page.
            if href is None:
                continue
            self.url_manager.add_url(self.url_domain + href)

        movies_collection = settings.db_collections["movies"]
        for table_tag in tables:
            move_doc = self.get_move_doc(table_tag)
            if move_doc is None:
                continue
            param = {"name": str(move_doc["name"])}
            select = db_utils.find_one(movies_collection, param)
            if select is None:
                db_utils.insert_one(movies_collection, move_doc)
            else:
                db_utils.update_many(movies_collection, param, {"$set": move_doc})

        self.url_manager.remove_url(url)

    @staticmethod
    def _last_matching_href(anchors, pattern):
        """Return the href of the last anchor whose href matches *pattern*, or ""."""
        found = ""
        for anchor in anchors:
            href = anchor.get("href")
            if href is not None and pattern.match(href) is not None:
                found = href
        return found

    def get_move_doc(self, table_tag):
        """Build the movie document for one table of a listing page.

        Fetches the movie's detail page and extracts the ftp link, the magnet
        link (missing on some pages, then stored as "") and the poster image.

        Args:
            table_tag: The bs4 tag of one movie table on the listing page.

        Returns:
            A dict with the keys ``name``, ``des``, ``icon``, ``detailUrl``,
            ``downloadUrl1`` and ``download_url2``, or ``None`` when the
            entry could not be fetched or parsed (the failure is logged).
        """
        try:
            link = table_tag.select_one(".ulink")
            name = link.contents[0]
            des = table_tag.select("tr")[3].select_one("td").contents[0]
            detail_url = self.domain + link["href"]

            response = requests.get(url=detail_url, headers=self.headers, timeout=self.request_timeout)
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text)
            zoom = soup.select_one("#Zoom")

            # ftp download address
            download_url1 = self._last_matching_href(zoom.select("table a"), self.download_url1_reg)
            # magnet link; some pages do not have one
            download_url2 = self._last_matching_href(zoom.select("p a"), self.download_url2_reg)
            icon = zoom.select_one("img").get("src")

            self.logger.info("电影名称：" + name)
            self.logger.info("ftp下载地址：" + download_url1)
            self.logger.info("磁力链接地址：" + download_url2)
            self.logger.info("电影详情" + detail_url)
            self.logger.info(icon)
            self.logger.info("========================================================================================")

            # NOTE(review): the key naming is inconsistent ("downloadUrl1" vs
            # "download_url2"); kept as-is because stored documents use it.
            return {
                "name": name,
                "des": des,
                "icon": icon,
                "detailUrl": detail_url,
                "downloadUrl1": download_url1,
                "download_url2": download_url2,
            }
        except Exception:
            # Best effort: an entry that cannot be fetched or parsed is
            # skipped, but the reason is logged rather than hidden, and
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            self.logger.warning("Skipping a movie entry that could not be parsed", exc_info=True)
            return None


movie = Movies()
movie.spider_movies()

使用 transmissionrpc 来下载磁力链接地址，以下为 Python 操作 transmissionrpc 的代码。
请先下载并安装 Transmission Qt Client，具体配置如下：

# Install (this line is a shell command, not Python):
pip install transmissionrpc
# Development code
import transmissionrpc
# Connecting with a username and password:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091, user='test', password='abcdefg123')
# Connecting without a username and password:
tc = transmissionrpc.Client(address='127.0.0.1', port=9091)
# Add a download task (a torrent file or a torrent URL)
tc.add_torrent(torrent=r"/data/1.torrent")
tc.add_torrent(torrent="magnet:?xt=urn:btih:...")
# Operations
# List the torrents to obtain a torrent_id
tc.get_torrents()
# Remove a download task; the torrent_id has to be obtained first
# (torrent_id is presumably a placeholder for the real id here)
tc.remove_torrent({torrent_id})
# Get a task object:
tr1 = tc.get_torrent(1)
# Controlling the torrent object: start, stop, status and refresh use
# start(), stop(), status() and update() respectively
# NOTE(review): transmissionrpc appears to expose `status` as a property
# rather than a method - confirm whether the call parentheses are valid.
tr1.start()
tr1.stop()
tr1.status()
# Note (from the original author): call update() once more after every
# start() or stop(), otherwise the change will not take effect.
tr1.update()

操作示例代码

class MovieDownload:
    """Thin wrapper around a transmissionrpc client for queuing downloads."""

    def __init__(self, address='127.0.0.1', port=9091, user='test', password='123456'):
        """Connect to the Transmission RPC service.

        The defaults reproduce the previously hard-coded connection values,
        so ``MovieDownload()`` behaves as before.

        Args:
            address: Host of the Transmission RPC service.
            port: Port of the Transmission RPC service.
            user: RPC user name.
            password: RPC password.
        """
        self.tc = transmissionrpc.Client(address=address, port=port, user=user, password=password)

    def download_movie(self, torrent_url):
        """Queue *torrent_url* (torrent file path, torrent URL or magnet link).

        Returns:
            Whatever ``add_torrent`` returns, so the caller can track the
            new task; callers that ignore the return value are unaffected.
        """
        return self.tc.add_torrent(torrent=torrent_url)


movie_download = MovieDownload()
torrent = "magnet:?xt=..."
movie_download.download_movie(torrent)

使用python爬取电影下载地址并使用transmissionrpc下载相关推荐

python爬取电影天堂的下载链接
python爬取电影天堂dytt8的下载链接电影天堂下载链接都是magnet的,搞下来想下就下没有广告建一个main.py 一个一个挨着去爬肯定慢啊,建一个多线程的去爬 mui.py 多线程有可能 ...
Python 爬取电影天堂top最新电影
Python爬虫有他无可比拟的优势:语法简单,经常几十行代码就能轻松解决问题,相比于JAVA,C,PHP;第三方库丰富,Python强大而又丰富的第三方库使他几乎可以无所不能.今天我们就来用用Pyth ...
python爬取电影天堂新片精品模块电影列表，并用迅雷下载
python版本是3.6.5,上代码: # 爬取电影天堂 from selenium import webdriver import requests from bs4 import Beautifu ...
python爬取电影评分_用Python爬取猫眼上的top100评分电影
代码如下: # 注意encoding = 'utf-8'和ensure_ascii = False,不写的话不能输出汉字 import requests from requests.exception ...
python爬取电影天堂首页
用python写了个小爬虫,用来爬取电影天堂首页放置的几十部电影的名称,上映日期和下载链接,用到了beautifulsoup库和lxml库用来解析代码如下: import requests impo ...
python爬取斗图啦表情包并下载到本地
迫于无聊,又刚好正在学习python,就来记录一篇关于python爬取图片链接下载本地的入门文章... 主要用到的模块: request 和 BeautifulSoup4 开发之前建议先看一下官方给出 ...
python爬取电影信息并插入至MySQL数据库
在上篇博文中,博主使用python爬取了豆瓣电影的影片信息,接下来,博主考虑到在之前做的JavaWeb电影院项目中需要向数据库中一个个的插入影片数据,十分的繁琐,那么在使用了python爬虫后,这个操 ...
python 爬取电影天堂电影
主要介绍爬取电影天堂首页的电影列表,并将结果保存为csv文件. 1.首先导入需要的模块 import requests from bs4 import BeautifulSoup import csv ...
python爬取电影天堂电影信息
from lxml import etree import requests # url='https://www.dytt8.net/html/gndy/dyzz/index.html' heade ...

使用python爬取电影下载地址并使用transmissionrpc下载

说明

爬取电影磁力地址

使用python爬取电影下载地址并使用transmissionrpc下载相关推荐

最新文章

热门文章