本人愚笨,代码如下

# -*- coding: UTF-8 -*-
# --author:valecalida--
# 2021/3/11 10:34
from re import findall
from bs4 import BeautifulSoup
from random import choice
from tqdm import tqdm
from time import sleep
import urllib3
from requests import get, HTTPError
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, create_engine
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Get_Rank(object):
    """Scrape the Maoyan movie board and collect name/stars/time/score rows.

    The board is fetched from ten page URLs (``offset`` 0, 10, ..., 90).
    Results accumulate on the instance in ``names``, ``stars``, ``times``,
    ``score`` and, after :meth:`get_infos`, in ``infos``.
    """

    def __init__(self):
        self.names, self.stars, self.times, self.score, self.infos = [], [], [], [], []
        self.urls = [("https://maoyan.com/board/4?offset=" + str(i)) for i in range(0, 100, 10)]

    @staticmethod
    def get_user_agent():
        """Return one User-Agent string chosen at random from a fixed pool."""
        # Every entry needs its own trailing comma: adjacent string literals
        # are concatenated implicitly, which would fuse two agents into one.
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]
        return choice(user_agent)

    @staticmethod
    def process_score(score):
        """Join each (integer, fraction) pair into a single score string.

        :param score: iterable of 2-item sequences as produced by the score
            regex in :meth:`process_regex`.
        :return: list of strings, one per pair, ``str(pair[0]) + str(pair[1])``.
        """
        return [str(pair[0]) + str(pair[1]) for pair in score]

    @staticmethod
    def process_regex(html):
        """Extract movie fields from the stringified ``<dd>`` elements.

        :param html: text to search (``get_infos`` passes ``str(soup.find_all('dd'))``).
        :return: tuple ``(movie_name, movie_score, movie_time, movie_star)`` where
            ``movie_score`` holds the raw (integer, fraction) regex groups.
        """
        movie_name = findall(r'<a data-act.*">(.*)</a>', html)
        movie_score = findall(r'<p class="score"><i class="integer">(.*)</i><i class="fraction">(.*)</i>', html)
        release_texts = findall(r'<p class="releasetime">(.*)</p>', html)
        star_texts = findall(r'<p class="star">\n[\s]*(.*)\n[\s]*</p>', html)
        movie_time, movie_star = [], []
        # Pair the two lists instead of indexing one by the other's length, so a
        # missing star/release paragraph cannot raise IndexError.
        for release_text, star_text in zip(release_texts, star_texts):
            # The slices drop a fixed-width prefix (5 and 3 characters);
            # presumably the field labels on the page -- verify against live HTML.
            movie_time.append(release_text[5:])
            movie_star.append(star_text[3:])
        return movie_name, movie_score, movie_time, movie_star

    def get_infos(self):
        """Fetch every board page and return the collected rows.

        Pages whose request ends in an HTTP error status are reported and
        skipped.

        :return: ``self.infos``, a list of ``[name, stars, time, score]`` lists.
        """
        print("[+] 开始爬取猫眼电影排行榜")
        for url in tqdm(self.urls):
            header = {"User-Agent": self.get_user_agent()}
            try:
                # ``headers=`` is required: the second positional argument of
                # requests.get() is ``params`` (the query string).
                res = get(url, headers=header)
                # requests does not raise on 4xx/5xx by itself; without this
                # call the HTTPError handler below could never run.
                res.raise_for_status()
            except HTTPError:
                print("[-] HTTP ERROR Occur!")
                continue
            soup = BeautifulSoup(res.text, 'lxml')
            html = str(soup.find_all('dd'))
            movie_name, movie_score, movie_time, movie_star = self.process_regex(html)
            self.names.extend(movie_name)
            self.stars.extend(movie_star)
            self.times.extend(movie_time)
            self.score.extend(movie_score)
        # Convert the scores once, then zip the columns: this copes with fewer
        # than 100 scraped rows instead of raising IndexError.
        scores = self.process_score(self.score)
        for name, star, time_, score in zip(self.names, self.stars, self.times, scores):
            self.infos.append([name, star, time_, score])
        return self.infos


class Create_Table(object):
    """Declare the ``movies`` table and create it when this class is defined."""

    # The deprecated ``encoding`` keyword is omitted: SQLAlchemy 2.0 rejects it
    # and utf-8 was already the default in 1.x.
    # NOTE(review): credentials are hard-coded in the URL; consider configuration.
    engine = create_engine("mysql+mysqlconnector://root:root@localhost:3306/rank_list")
    Base = declarative_base()

    class movies(Base):
        """ORM mapping for one row of the ``movies`` table."""

        __tablename__ = 'movies'
        id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
        names = Column(String(50))
        stars = Column(String(50))
        times = Column(String(20))
        score = Column(String(10))

    # Runs at class-definition (import) time and needs a reachable database.
    Base.metadata.create_all(engine)


class Mysql_Operate(object):
    """Scrape the board on construction and write the rows to MySQL."""

    def __init__(self):
        self.engine = create_engine("mysql+mysqlconnector://root:root@localhost:3306/rank_list")
        self.Session_Class = sessionmaker(bind=self.engine)
        self.Session = self.Session_Class()
        # Scraping happens here, so constructing this object performs network I/O.
        self.infos = Get_Rank().get_infos()
        self.names, self.stars, self.times, self.score = [], [], [], []

    def commit_mysql(self):
        """Insert every scraped row into ``movies`` with ids starting at 1.

        All rows are written in one transaction; on any failure the
        transaction is rolled back and the exception is re-raised.
        """
        rows = [
            Create_Table.movies(id=row_id, names=info[0], stars=info[1], times=info[2], score=info[3])
            for row_id, info in enumerate(self.infos, start=1)
        ]
        try:
            self.Session.add_all(rows)
            self.Session.commit()
        except Exception:
            self.Session.rollback()
            raise
        finally:
            self.Session.close()


if __name__ == '__main__':
    print("[*] 在开始爬取之前,请确认您可以访问该网站!")
    Mysql_Operate().commit_mysql()

运行结果比较简单,验证如下:

[+] 开始爬取猫眼电影排行榜
100%|██████████| 10/10 [00:07<00:00,  1.27it/s]

MySQL验证操作如下:

mysql> use rank_list;
Database changed
mysql> show tables;
+---------------------+
| Tables_in_rank_list |
+---------------------+
| movies              |
+---------------------+
1 row in set (0.00 sec)
mysql> select * from movies where id=1;
+----+-----------------+----------------------------+------------+-------+
| id | names           | stars                      | times      | score |
+----+-----------------+----------------------------+------------+-------+
|  1 | 我不是药神      | 徐峥,周一围,王传君         | 2018-07-05 | 9.6   |
+----+-----------------+----------------------------+------------+-------+
1 row in set (0.00 sec)
mysql> select * from movies where id=100;
+-----+-----------+----------------------------------------------------------------+--------------------+-------+
| id  | names     | stars                                                          | times              | score |
+-----+-----------+----------------------------------------------------------------+--------------------+-------+
| 100 | 禁闭岛    | 莱昂纳多·迪卡普里奥,马克·鲁法洛,本·金斯利                      | 2010-02-13(德国)   | 8.7   |
+-----+-----------+----------------------------------------------------------------+--------------------+-------+
1 row in set (0.00 sec)
mysql> select * from movies where id=99;
+----+--------------+----------------------------------------------------------------+------------+-------+
| id | names        | stars                                                          | times      | score |
+----+--------------+----------------------------------------------------------------+------------+-------+
| 99 | 模仿游戏     | 本尼迪克特·康伯巴奇,凯拉·奈特莉,马修·古迪                      | 2015-07-21 | 9.3   |
+----+--------------+----------------------------------------------------------------+------------+-------+
1 row in set (0.00 sec)

Python爬取猫眼电影排行榜并写入MySQL相关推荐

  1. 2019-01-18-Python爬取猫眼电影排行榜

    title: Python爬取猫眼电影排行榜 date: 2019-01-18 20:44:16 tags: python lxml requests json categories: python ...

  2. 【Python爬虫】爬取猫眼电影排行榜并存放至csv文件

    在进行本节实战之前,希望您对requests库以及正则表达式有所了解. 运行平台:windows **Python版本: Python3.x ** 一.依赖库的安装 在本节实战之前,请确保已经正确安装 ...

  3. 爬虫(2)-解析库xpath和beautifulsoup爬取猫眼电影排行榜前100部电影

    解析库爬取猫眼电影前100部电影 认为有用的话请点赞,码字不易,谢谢. 其他爬虫实战请查看:https://blog.csdn.net/qq_42754919/category_10354544.ht ...

  4. Python爬取猫眼电影TOP100榜

    Python爬取猫眼电影TOP100榜 兴趣点: 这个没什么特别的兴趣,只是单纯爬猫眼练手的人太多了,所以我也打算加入他们,对猫眼员工说一声不好意思了,哈哈哈! 爬取网址: 传送门:https://m ...

  5. (伪)Python爬取猫眼电影(反反爬虫过程中遇到的坑)

    Python爬取猫眼电影 1.打开一个猫眼电影的URL,例如本月的较火的电影<毒液:致命守护者>http://maoyan.com/films/42964 直接F12,查看审核元素,发现上 ...

  6. python爬取猫眼电影数据

    每天一点点,记录学习每一步 近期爬虫项目: 1:python 爬取菜鸟教程python100题,百度贴吧图片反爬虫下载,批量下载 2:python爬虫爬取百度贴吧图片,requests方法 3:pyt ...

  7. 爬虫(1)-正则化表达式爬取猫眼电影排行榜前100部电影

    爬取猫眼电影排行榜前100部电影 文章目录 爬取猫眼电影排行榜前100部电影 1.抓取首页 2.正则化表达式提取信息 3.保存到文件中 4.抓取前100部电影 认为有用的话请点赞,码字不易,谢谢. 其 ...

  8. python猫眼电影分析_用Python 爬取猫眼电影数据分析《无名之辈》

    前言 作者: 罗昭成 PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取 http://note.youdao.com/noteshare?id=3054cce4add8a909e7 ...

  9. 利用python爬取猫眼电影,分析《大侦探皮卡丘》|凹凸数读

    利用python爬取猫眼电影,分析<大侦探皮卡丘>,看看当皮卡丘长出绒毛,"丑拒"还是"真香"都在猫眼短评里了. 本文首发于微信公众号<凹凸数 ...

  10. Python语言实现用requests和正则表达式方法爬取猫眼电影排行榜前100部电影

    #爬取猫眼电影排名前100的电影 import requests #电脑向服务器发送的请求库 from requests.exceptions import RequestException impo ...

最新文章

  1. Sentinel v1.4.2 发布,更好用的集群限流功能
  2. 前端工程师系列,TCP复习及浓缩总结(全干货,支持面试)
  3. Redis的Hash操作
  4. Java 集合系列(1): Collection架构
  5. 网络请求与本地函数调用的区别
  6. Linux FTP安装问题
  7. php h5读写数据库,H5学习_番外篇_PHP数据库操作
  8. android+桌面+横屏,安卓怎么强制桌面横屏
  9. 详细介绍MATLAB导入文本文件、excel等数据文件
  10. 苹果手机各种尺寸列表
  11. 人脸识别:技术应用与商业实践
  12. iptables 中 SNAT、DNAT 和 MASQUERADE 的含义
  13. OpenSSL sm2 签名源码讲解
  14. 什么是CDN,为什么用CDN,如何用CDN
  15. SGX初始化中ELF文件解析
  16. Python Animation 画动态图形
  17. 阿里测试开发python面试题_[阿里面试]测试开发工程师面试
  18. 同花顺的故事(7)业务逻辑相关
  19. Week8学习总结-数据库
  20. Oracle表数据转换为XML格式数据

热门文章

  1. 从零实现深度学习框架——实现常见运算的计算图(上)
  2. 十分钟弄懂字节对编码
  3. 词性标注-隐马尔可夫模型
  4. 白板推导系列Pytorch-隐马尔可夫模型-学习问题
  5. 走向TensorFlow2.0,一步到位
  6. Mesos和Docker的集成
  7. 操作系统直接决定了计算机系统的整体性能
  8. 《我也能做CTO之程序员职业规划》和《.NET软件设计新思维——像搭积木一样搭建软件》新书发布会 回顾
  9. 二分法05:搜索旋转排序数组
  10. BPTT算法推导以及LSTM是如何解决梯度消失的