#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python:                  3.5
# Author:                  wucl(),zhenghai.zhang
# Program:                 爬取CBO网站上所有电影的名称并写入数据库。
# Version:                 0.1
# History:                 2017.10.25

import datetime
import re
import time

import pymysql
import requests
from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody

host = 'xxx'
# MySQL connection settings and table names used by the functions below.
# They are handed to pymysql.connect() and interpolated into the SQL
# statements as identifiers.
user = 'xxx'
passwd = 'xxx'
dbme = 'crawl'  # database the connection is opened against
dbtarget = 'back_brace'  # database qualifying the sync table (dbtarget.tablesync)
table = 'movie_hotwords'  # table Movie_insert() writes crawled movies into
tabledelta = 'movie_hotwords_delta'  # table Movie_new_and_sync() empties and refills
tablesync = 'slot_value'  # table that receives one row per newly added movie
port = 3306
tolist = ['xxx@xxx.com']

# CBO movie-list endpoint; the 1-based page index is appended to it.
MOVIE_LIST_URL = (
    'http://www.cbooo.cn/Mdata/getMdata_movie'
    '?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex='
)

# Seconds to wait for the CBO site before giving up on a request.
REQUEST_TIMEOUT = 10

# Timestamp layout written into the gmt_create / gmt_modify columns.
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

# Characters removed from movie names before they are stored.
# NOTE(review): in the scraped source this literal contained an unescaped
# double quote, so it could not be parsed as written. The set below is the
# set of characters the original character class would have matched. The
# ASCII symbols may originally have been their full-width forms -- confirm
# against the upstream script.
PUNCTUATION = (
    "!?。\"#$%&'()*+,-/:;<=>@[]^_`{|}~"
    "⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿"
    "–—‘’‛“”„‟…‧﹏.()::。·"
)
_PUNCTUATION_TABLE = str.maketrans("", "", PUNCTUATION)


def clean_movie_name(name):
    """Return *name* with every character listed in PUNCTUATION removed."""
    return name.translate(_PUNCTUATION_TABLE)


def build_email_body(movies):
    """Return the HTML mail body listing each movie name followed by <br>."""
    lines = "".join(movie["MovieName"] + "<br>" for movie in movies)
    return "本次新增的电影名称为:<hr>" + lines


def _fetch_page(page):
    """Download one page of the movie list and return the decoded JSON."""
    response = requests.get(MOVIE_LIST_URL + str(page), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def get_info():
    """Return (total pages, total movie count) as reported by page 1.

    Returns None, after printing a message, when the request fails or the
    response does not have the expected keys.
    """
    try:
        data = _fetch_page(1)
        return data['tPage'], data['tCount']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        print("获取总页数和总电影数失败", err)
        return None


def get_movies(page):
    """Return the list of movie dicts found on *page*.

    Returns an empty list, after printing a message, when the page cannot be
    fetched or decoded, so callers can always iterate over the result.
    """
    try:
        return _fetch_page(page)['pData']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        print('获取第%s页电影列表失败' % page, err)
        return []


def Movie_insert(host, user, passwd, dbme, port, table, movies_list):
    """Insert crawled movies into *table* and return the ones actually added.

    Each movie's 'MovieName' is cleaned in place with clean_movie_name()
    before insertion. A movie whose insert fails (for example because the row
    already exists) is reported and left out of the returned list.
    """
    new_movies = []
    # Identifiers cannot be bound as parameters, so the table name is
    # formatted in; the values are bound by the driver.
    insert_sql = 'insert into {}(movie_id, movie_name) values(%s, %s)'.format(table)
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            for movie in movies_list or []:
                movie['MovieName'] = clean_movie_name(movie['MovieName'])
                try:
                    cur.execute(insert_sql, (movie['ID'], movie['MovieName']))
                except pymysql.IntegrityError:
                    print(" "*20, movie['MovieName'], "already exists, skip……")
                except pymysql.Error as err:
                    print(" "*20, movie['MovieName'], "insert failed, skip:", err)
                else:
                    new_movies.append(movie)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
    return new_movies


def _release_task_sql(app_type, app_status, register_id, stamp):
    """Build the insert statement for one back_brace.release_task row."""
    return (
        'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,'
        'register_id,create_by,modify_by,gmt_create,gmt_modify) '
        'values("%s","%s","SLOT","xxxxxx","init","%s" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")'
        % (app_type, app_status, register_id, stamp, stamp)
    )


def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):
    """Refill the delta table with *movies_list* and sync it to the target DB.

    Steps: empty dbme.tabledelta, insert every movie into it and into
    dbtarget.tablesync, then insert two rows into back_brace.release_task.
    Database errors on individual rows are reported and skipped.
    """
    delta_sql = 'insert into {}(movie_id, movie_name) values(%s, %s)'.format(tabledelta)
    sync_sql = (
        'insert into {}.{}(slot_type_id, slot_value, create_by, modify_by, '
        'gmt_create, gmt_modify, out_value) values(%s, %s, %s, %s, %s, %s, %s)'
    ).format(dbtarget, tablesync)

    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            cur.execute("delete from {}.{}".format(dbme, tabledelta))
            for movie in movies_list:
                # One timestamp per row so gmt_create and gmt_modify agree.
                stamp = datetime.datetime.now().strftime(TIME_FORMAT)
                try:
                    cur.execute(delta_sql, (movie['ID'], movie['MovieName']))
                    cur.execute(sync_sql, ("xxxxxx", movie['MovieName'], "system",
                                           "system", stamp, stamp, ""))
                except pymysql.IntegrityError:
                    print(" " * 20, movie['MovieName'], "already exists, skip……")
                except pymysql.Error as err:
                    print(" " * 20, movie['MovieName'], "sync failed, skip:", err)

            # NOTE(review): the release tasks are queued once per run, even
            # when movies_list is empty -- the source's indentation was lost,
            # so confirm this was not meant to be conditional.
            try:
                stamp = datetime.datetime.now().strftime(TIME_FORMAT)
                cmdbacktoskill = _release_task_sql(
                    "BACKBRACE", "testpass", "SLOT_BACKBRACE_TESTPASS", stamp)
                cmdskilltoskillpro = _release_task_sql(
                    "SKILL", "deploy", "SLOT_SKILL_DEPLOY", stamp)
                print(cmdbacktoskill)
                cur.execute(cmdbacktoskill)
                print(cmdskilltoskillpro)
                cur.execute(cmdskilltoskillpro)
            except pymysql.Error as err:
                print("write into back_brace.release_task error!!!", err)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()


def Email(to, subject, body):
    """Send *body* as an HTML mail with *subject* to the address *to*."""
    creds = Credentials(username='xxxxxx', password='xxxxxx')
    account = Account(
        primary_smtp_address='xxx@xxx.com',
        credentials=creds,
        autodiscover=True,
        access_type=DELEGATE,
    )
    message = Message(
        account=account,
        subject=subject,
        body=HTMLBody(body),
        to_recipients=[Mailbox(email_address=to)],
    )
    message.send_and_save()


def main():
    """Crawl the movie list, store new movies, sync them and mail a summary."""
    update_movies = []

    # Without the site there is nothing to crawl; stop instead of crashing
    # while unpacking a failed result.
    if get_info() is None:
        return

    # NOTE(review): the original fetched the total page count and then
    # overrode it with 1, so only the first page is crawled. Kept as is;
    # confirm whether crawling every page was intended.
    pages = 1

    for page in range(1, pages + 1):
        print("*"*30, page, "*"*30)
        movies_list = get_movies(page)
        new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list)
        for new_movie in new_movies:
            print(new_movie['MovieName'], "Added")
            update_movies.append({"ID": new_movie["ID"],
                                  "MovieName": new_movie["MovieName"]})
        time.sleep(1)  # pause between pages

    print(update_movies)

    try:
        # Write the newly added movies into the movie_hotwords_delta table.
        Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port,
                           tabledelta, update_movies, tablesync)
    except Exception as err:
        # Best effort: the summary mail is still sent when the sync fails.
        print("Movie update and sync Error!", err)

    subject = '本次新增电影名称'
    body = build_email_body(update_movies)
    for to in tolist:
        Email(to, subject, body)


if __name__ == '__main__':
    main()

欢迎大侠指点

Python3.5爬取cbooo.cn数据并且同步到mysql中相关推荐

  1. python豆瓣mysql_Python3.5爬取豆瓣电视剧数据并且同步到mysql中

    #!/usr/local/bin/python # -*- coding: utf-8 -*- # Python: 3.5 # Author: zhenghai.zhang@xxx.com # Pro ...

  2. python爬豆瓣电视剧_Python3.5爬取豆瓣电视剧数据并且同步到mysql中

    #!/usr/local/bin/python#-*- coding: utf-8 -*-#Python: 3.5#Author: zhenghai.zhang@xxx.com#Program: 爬取 ...

  3. 爬取豆瓣评论连接mysql_Python3.5爬取豆瓣电视剧数据并且同步到mysql中

    #!/usr/local/bin/python#-*- coding: utf-8 -*-#Python: 3.5#Author: zhenghai.zhang@xxx.com#Program: 爬取 ...

  4. python爬取微博数据存入数据库_Python爬取新浪微博评论数据,写入csv文件中

    因为新浪微博网页版爬虫比较困难,故采取用手机网页端爬取的方式 操作步骤如下: 1. 网页版登陆新浪微博 2.打开m.weibo.cn 3.查找自己感兴趣的话题,获取对应的数据接口链接 4.获取cook ...

  5. 使用Streamsets将Oracle数据实时同步到MySQL中

    相关环境: Oracle 11g:11.2.0.1.0  MySQL:8.0.22 前期准备: 1.打开Oracle的logminer a.在SQL Shell中,以具有DBA的用户身份登录数据库: ...

  6. Python爬取新浪微博评论数据,写入csv文件中

    因为新浪微博网页版爬虫比较困难,故采取用手机网页端爬取的方式 操作步骤如下: 1. 网页版登陆新浪微博 2.打开m.weibo.cn 3.查找自己感兴趣的话题,获取对应的数据接口链接 4.获取cook ...

  7. 自动化爬取淘宝数据--(保存到文本中)

    普通版本的爬取淘宝网页 #淘宝商品信息 #从selenium中引入webdriver from selenium import webdriver import time #输入查询关键词 keywo ...

  8. 读书笔记(4)——python爬取糗事百科,并存到MySQL中

    2019独角兽企业重金招聘Python工程师标准>>> 安装MySQL.使用phpStudy集成工具来安装MySQL服务器,或者可以用USBwebserve进行安装. 打开USBwe ...

  9. Python爬取热门微博,并存储到MySQL中

    目标网站:m.weibo.cn url的获取可以从浏览器的F12中的network的XHR中找到. weibo_demo.py: import requests import json from w3 ...

  10. 对爬虫爬取到的数据进行存储

    已写章节 第一章 网络爬虫入门 第二章 基本库的使用 第三章 解析库的使用 第四章 数据存储 第五章 动态网页的抓取 文章目录 已写章节 第四章 数据存储 4.1 文件存储 4.1.1 TXT文件存储 ...

最新文章

  1. 某快手程序员吐槽:月薪四万很惶恐!和老婆亲热时都在想工作,薪资越高,做人越怂!...
  2. linux wget命令详解
  3. 汽车常识全面介绍 - 车身
  4. 实验三——vlan间路由
  5. 2008 r2彻底删除 server sql_mysql添加列、删除列,创建主键、备份等常用操作总结...
  6. python创建sqlite3数据库_树莓派使用 Python + SQLite 建立温度数据库
  7. 11.4 final类
  8. 苹果推出新款iPad Air和iPad mini,升级A12处理器
  9. 利用truffle与智能合约进行交互
  10. 数据分析|如何利用BI工具,探索各商品的潜在关联价值
  11. 漫画:如何给初学者讲“为什么计算机只认识 0 和 1”?
  12. python 交换机巡检脚本_Python自动巡检H3C交换机实现过程解析
  13. 综合计算机工时,计算机辅助工时定额制定与管理系统的研究与开发
  14. 【Spark ML】第 3 章:监督学习
  15. 如何有效破解PDF文件的密码?
  16. 【解码芯片MIPI输出 四合一】XS9922B 国产 4通道模拟复合视频解码芯片 对标TP2815
  17. 网络配置问题Bringing up interface eth0: Device eth0 does not seem to be present, delaying initialization.
  18. Mac上一款简单实用音频剪辑工具——QuickTime Player
  19. android 图片编辑工具,照片编辑器:Photo Editor
  20. K8S重启后coredns pod无法正常运行

热门文章

  1. python-sklearn实现一个简易的智能问答机器人
  2. 微信公众号 网页授权登入
  3. python找不到模块pyodbc_Python:找不到pyodbc导入模块
  4. linux目录更改权限不够,Linux中文件夹访问权限不足
  5. matlab中二阶偏导数,matlab中二元函数的一阶和二阶偏导数
  6. 网站搭建教程(怎么建网站详细步骤)
  7. ArcGIS for Android 100.3的学习与应用(三) 实现地图添加自定义指北针
  8. 第二章作业习题答案续
  9. 三维实景建模技术在智慧交通领域的新发展与深入应用
  10. mysql计算同比和环比的区别_SQL 求同比 环比