Python3.5爬取cbooo.cn数据并且同步到mysql中
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python: 3.5
# Author: wucl(),zhenghai.zhang
# Program: Crawl the names of all movies listed on the CBO site (cbooo.cn)
#          and write them to the database.
# Version: 0.1
# History: 2017.10.25

import datetime
import re
import time

import pymysql
import requests
from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody

# --- Connection / table configuration (placeholders in the published script) ---
host = 'xxx'
user = 'xxx'
passwd = 'xxx'
dbme = 'crawl'
dbtarget = 'back_brace'
table = 'movie_hotwords'
tabledelta = 'movie_hotwords_delta'
tablesync = 'slot_value'
port = 3306
tolist = ['xxx@xxx.com']

# Movie-list endpoint; the page index is appended to this prefix.
MOVIE_LIST_URL = ('http://www.cbooo.cn/Mdata/getMdata_movie'
                  '?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=')

# Without a timeout requests.get() can block forever on a stalled server.
REQUEST_TIMEOUT = 30

# Characters removed from movie names before they are stored.
# NOTE(review): the published listing shows these ASCII characters, but the
# literal was not valid Python as shown (unescaped double quote), so the page
# may have normalised full-width punctuation to ASCII -- confirm the intended
# character set against the original script.
_PUNCTUATION = (
    "!?。\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~"
    "⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿"
    "–—‘’‛“”„‟…‧﹏.()::。·"
)
# re.escape keeps every character literal inside the class (no accidental
# ranges or early "]"); compiled once because it is applied to every movie.
_PUNCTUATION_RE = re.compile("[%s]+" % re.escape(_PUNCTUATION))

_RELEASE_TASK_SQL = (
    'insert into back_brace.release_task('
    'app_type,app_status,type,ref_id,status,register_id,'
    'create_by,modify_by,gmt_create,gmt_modify) '
    'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _now():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def get_info():
    """Fetch page 1 of the movie list and return its totals.

    Returns:
        A tuple ``(pData['tPage'], pData['tCount'])`` taken from the JSON
        response, or ``None`` (after printing a message) when the request
        fails or the response does not have the expected keys.
    """
    try:
        pData = requests.get(MOVIE_LIST_URL + '1', timeout=REQUEST_TIMEOUT).json()
        return pData['tPage'], pData['tCount']
    except (requests.RequestException, ValueError, KeyError) as exc:
        # ValueError covers a body that is not valid JSON.
        print("获取总页数和总电影数失败", exc)
        return None


def get_movies(page):
    """Return the ``pData`` list of movie records for one result page.

    Args:
        page: page index appended to the list URL.

    Returns:
        The list found under the ``pData`` key of the JSON response, or an
        empty list (after printing a message) on failure, so callers can
        always iterate the result.
    """
    try:
        pData = requests.get(MOVIE_LIST_URL + str(page), timeout=REQUEST_TIMEOUT).json()
        return pData['pData']
    except (requests.RequestException, ValueError, KeyError) as exc:
        print('获取第%s页电影列表失败' % page, exc)
        return []


def Movie_insert(host, user, passwd, dbme, port, table, movies_list):
    """Insert crawled movies into ``table`` and return the ones actually added.

    Each movie's ``MovieName`` is stripped of punctuation in place before the
    insert. Rows the database rejects are reported and skipped.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        table: target table name (trusted configuration, not user input).
        movies_list: iterable of dicts with ``ID`` and ``MovieName`` keys;
            ``None`` is treated as empty.

    Returns:
        List of the movie dicts whose insert succeeded.
    """
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    new_movies = []
    # Only the identifier is formatted in; values go through the driver so
    # quotes inside a movie name cannot break (or inject into) the statement.
    sql = 'insert into {}(movie_id, movie_name) values(%s, %s)'.format(table)
    try:
        cur = conn.cursor()
        try:
            for movie in movies_list or []:
                movie['MovieName'] = _PUNCTUATION_RE.sub("", movie["MovieName"])
                try:
                    cur.execute(sql, (movie['ID'], movie['MovieName']))
                except pymysql.IntegrityError:
                    print(" " * 20, movie['MovieName'], "already exists, skip……")
                except pymysql.Error as exc:
                    print(" " * 20, movie['MovieName'], "insert failed, skip:", exc)
                else:
                    new_movies.append(movie)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
    return new_movies


def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):
    """Rewrite the delta table with ``movies_list`` and sync it to the target DB.

    Steps, all on one connection and committed together at the end:
      1. delete every row from ``dbme.tabledelta``;
      2. for each movie insert a row into ``tabledelta`` and a row into
         ``dbtarget.tablesync``;
      3. insert two rows into ``back_brace.release_task``.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        dbtarget: database that holds ``tablesync``.
        tabledelta: delta table name inside ``dbme``.
        movies_list: iterable of dicts with ``ID`` and ``MovieName`` keys.
        tablesync: table name inside ``dbtarget``.
    """
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    delta_sql = 'insert into {}(movie_id, movie_name) values(%s, %s)'.format(tabledelta)
    sync_sql = (
        'insert into {}(slot_type_id, slot_value, create_by, modify_by, '
        'gmt_create, gmt_modify, out_value) '
        'values(%s, %s, %s, %s, %s, %s, %s)'
    ).format(dbtarget + "." + tablesync)
    try:
        cur = conn.cursor()
        try:
            # Parenthesised: "%" binds tighter than "+", so the unparenthesised
            # form formatted only dbme and appended the rest afterwards.
            cur.execute("delete from %s" % (dbme + "." + tabledelta))

            for movie in movies_list:
                stamp = _now()
                try:
                    cur.execute(delta_sql, (movie['ID'], movie['MovieName']))
                    cur.execute(sync_sql, ("xxxxxx", movie['MovieName'], "system",
                                           "system", stamp, stamp, ""))
                except pymysql.IntegrityError:
                    print(" " * 20, movie['MovieName'], "already exists, skip……")
                except pymysql.Error as exc:
                    print(" " * 20, movie['MovieName'], "insert failed, skip:", exc)

            # NOTE(review): the release_task table is addressed as
            # back_brace.* regardless of the dbtarget argument -- confirm
            # that is intended.
            try:
                stamp = _now()
                release_rows = (
                    ("BACKBRACE", "testpass", "SLOT", "xxxxxx", "init",
                     "SLOT_BACKBRACE_TESTPASS", "zhenghai.zhang", "zhenghai.zhang",
                     stamp, stamp),
                    ("SKILL", "deploy", "SLOT", "xxxxxx", "init",
                     "SLOT_SKILL_DEPLOY", "zhenghai.zhang", "zhenghai.zhang",
                     stamp, stamp),
                )
                for params in release_rows:
                    # mogrify renders the statement exactly as it will be sent.
                    print(cur.mogrify(_RELEASE_TASK_SQL, params))
                    cur.execute(_RELEASE_TASK_SQL, params)
            except pymysql.Error:
                print("write into back_brace.release_task error!!!")
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()


def Email(to, subject, body):
    """Send an HTML mail to one recipient through the Exchange account.

    Args:
        to: recipient e-mail address.
        subject: subject line.
        body: HTML markup used as the message body.
    """
    creds = Credentials(username='xxxxxx', password='xxxxxx')
    account = Account(primary_smtp_address='xxx@xxx.com', credentials=creds,
                      autodiscover=True, access_type=DELEGATE)
    m = Message(account=account, subject=subject, body=HTMLBody(body),
                to_recipients=[Mailbox(email_address=to)])
    m.send_and_save()


def main():
    """Crawl, store new movies, sync them, then mail the list of additions."""
    update_movies = []

    info = get_info()
    if info is None:
        # get_info already reported the failure; stop with a non-zero status
        # instead of failing later on tuple unpacking.
        raise SystemExit(1)
    pages, counts = info

    # NOTE(review): the page count returned by the site is overridden here,
    # so only the first page is crawled -- confirm this is intentional.
    pages = 1

    for i in range(1, pages + 1):
        print("*" * 30, i, "*" * 30)
        movies_list = get_movies(i)
        new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list)
        for new_movie in new_movies:
            print(new_movie['MovieName'], "Added")
            update_movies.append({"ID": new_movie["ID"],
                                  "MovieName": new_movie["MovieName"]})
        time.sleep(1)  # pause between page requests
    print(update_movies)

    try:
        # Write the newly added movies into the movie_hotwords_delta table.
        Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port,
                           tabledelta, update_movies, tablesync)
    except Exception as exc:  # top-level boundary: report and still send mail
        print("Movie update and sync Error!", exc)

    subject = '本次新增电影名称'
    body = "本次新增的电影名称为:<hr>" + "".join(
        movie["MovieName"] + "<br>" for movie in update_movies)
    for to in tolist:
        Email(to, subject, body)


if __name__ == '__main__':
    main()
欢迎大侠指点
《Python3.5爬取cbooo.cn数据并且同步到mysql中》相关推荐
- python豆瓣mysql_Python3.5爬取豆瓣电视剧数据并且同步到mysql中
#!/usr/local/bin/python # -*- coding: utf-8 -*- # Python: 3.5 # Author: zhenghai.zhang@xxx.com # Pro ...
- python爬豆瓣电视剧_Python3.5爬取豆瓣电视剧数据并且同步到mysql中
#!/usr/local/bin/python#-*- coding: utf-8 -*-#Python: 3.5#Author: zhenghai.zhang@xxx.com#Program: 爬取 ...
- 爬取豆瓣评论连接mysql_Python3.5爬取豆瓣电视剧数据并且同步到mysql中
#!/usr/local/bin/python#-*- coding: utf-8 -*-#Python: 3.5#Author: zhenghai.zhang@xxx.com#Program: 爬取 ...
- python爬取微博数据存入数据库_Python爬取新浪微博评论数据,写入csv文件中
因为新浪微博网页版爬虫比较困难,故采取用手机网页端爬取的方式 操作步骤如下: 1. 网页版登陆新浪微博 2.打开m.weibo.cn 3.查找自己感兴趣的话题,获取对应的数据接口链接 4.获取cook ...
- 使用Streamsets将Oracle数据实时同步到MySQL中
相关环境: Oracle 11g:11.2.0.1.0 MySQL:8.0.22 前期准备: 1.打开Oracle的logminer a.在SQL Shell中,以具有DBA的用户身份登录数据库: ...
- Python爬取新浪微博评论数据,写入csv文件中
因为新浪微博网页版爬虫比较困难,故采取用手机网页端爬取的方式 操作步骤如下: 1. 网页版登陆新浪微博 2.打开m.weibo.cn 3.查找自己感兴趣的话题,获取对应的数据接口链接 4.获取cook ...
- 自动化爬取淘宝数据--(保存到文本中)
普通版本的爬取淘宝网页 #淘宝商品信息 #从selenium中引入webdriver from selenium import webdriver import time #输入查询关键词 keywo ...
- 读书笔记(4)——python爬取糗事百科,并存到MySQL中
2019独角兽企业重金招聘Python工程师标准>>> 安装MySQL.使用phpStudy集成工具来安装MySQL服务器,或者可以用USBwebserve进行安装. 打开USBwe ...
- Python爬取热门微博,并存储到MySQL中
目标网站:m.weibo.cn url的获取可以从浏览器的F12中的network的XHR中找到. weibo_demo.py: import requests import json from w3 ...
- 对爬虫爬取到的数据进行存储
已写章节 第一章 网络爬虫入门 第二章 基本库的使用 第三章 解析库的使用 第四章 数据存储 第五章 动态网页的抓取 文章目录 已写章节 第四章 数据存储 4.1 文件存储 4.1.1 TXT文件存储 ...
最新文章
- 某快手程序员吐槽:月薪四万很惶恐!和老婆亲热时都在想工作,薪资越高,做人越怂!...
- linux wget命令详解
- 汽车常识全面介绍 - 车身
- 实验三——vlan间路由
- 2008 r2彻底删除 server sql_mysql添加列、删除列,创建主键、备份等常用操作总结...
- python创建sqlite3数据库_树莓派使用 Python + SQLite 建立温度数据库
- 11.4 final类
- 苹果推出新款iPad Air和iPad mini,升级A12处理器
- 利用truffle与智能合约进行交互
- 数据分析|如何利用BI工具,探索各商品的潜在关联价值
- 漫画:如何给初学者讲“为什么计算机只认识 0 和 1”?
- python 交换机巡检脚本_Python自动巡检H3C交换机实现过程解析
- 综合计算机工时,计算机辅助工时定额制定与管理系统的研究与开发
- 【Spark ML】第 3 章:监督学习
- 如何有效破解PDF文件的密码?
- 【解码芯片MIPI输出 四合一】XS9922B 国产 4通道模拟复合视频解码芯片 对标TP2815
- 网络配置问题Bringing up interface eth0: Device eth0 does not seem to be present, delaying initialization.
- Mac上一款简单实用音频剪辑工具——QuickTime Player
- android 图片编辑工具,照片编辑器:Photo Editor
- K8S重启后coredns pod无法正常运行
热门文章
- python-sklearn实现一个简易的智能问答机器人
- 微信公众号 网页授权登入
- python找不到模块pyodbc_Python:找不到pyodbc导入模块
- linux目录更改权限不够,Linux中文件夹访问权限不足
- matlab中二阶偏导数,matlab中二元函数的一阶和二阶偏导数
- 网站搭建教程(怎么建网站详细步骤)
- ArcGIS for Android 100.3的学习与应用(三) 实现地图添加自定义指北针
- 第二章作业习题答案续
- 三维实景建模技术在智慧交通领域的新发展与深入应用
- mysql计算同比和环比的区别_SQL 求同比 环比