# -*- coding=utf-8 -*-
import datetime
import json
import time

import bs4
import pymysql
import requests


def db_exe(sql, judge):
    # Run a SQL statement with up to five retries.
    # judge == 'find' executes a SELECT and returns the rows; anything else commits a write.
    num = 1
    if judge == 'find':
        while True:
            if num > 5:
                return '失败'
            try:
                cur.execute(sql)
                return cur.fetchall()
            except:
                print('查询用户失败,正在重试!')
                num += 1
    else:
        while True:
            if num > 5:
                return '失败'
            pymysql.escape_string(sql)
            try:
                cur.execute(sql)
                db.commit()
                print('写入数据库成功')
                break
            except Exception as e:
                print('写入数据库失败,正在重试' + str(num) + str(e))
                time.sleep(2)
                num += 1
                db.rollback()


def soup_bs(url):
    # Download a page and return its BeautifulSoup tree, retrying up to five times.
    num = 1
    while True:
        try:
            if num > 5:
                print('重试失败超过五次,退出本次循环')
                return '失败'
            res = requests.get(url, headers=headers, timeout=10)
            soup = bs4.BeautifulSoup(res.content, 'lxml')  # parse the page source
            print('正在解析网页源码,请稍等')
            return soup
        except:
            print('网页源码解析失败,正在重试第:' + str(num) + '次')
            num += 1
            time.sleep(3)


def add_user():
    # Fill in the display name for newly added accounts (rows whose ins_name is empty).
    sql = "select ins_number from ins_index where ins_name='' or ins_name is null "
    results = db_exe(sql, 'find')  # users whose ins_name is still empty
    if results == '失败':
        return '失败'
    print(len(results))
    if len(results) == 0:
        print('没有新添加的用户!,执行下一步')
        return
    else:
        for result in results:
            print('新添加用户为:' + result[0])
            print('正在获取该用户的用户名=====')
            url = veryins_url + '/' + result[0]
            soup = soup_bs(url)
            if soup == '失败':
                print('获取用户' + result[0] + '的页面失败,跳出该用户查询')
                continue
            ins_name = soup.find(attrs={'id': "username"}).get('data-fullname')
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print('当前用户的用户名为:' + ins_name)
            if ins_name == '':
                # no display name on the page, fall back to the account number
                sql = ("update ins_index set ins_name='%s',time_add='%s',time_update='%s' where ins_number='%s'" % (result[0], now_time, now_time, result[0]))
            else:
                sql = ("update ins_index set ins_name='%s',time_add='%s',time_update='%s' where ins_number='%s'" % (ins_name, now_time, now_time, result[0]))
            db_exe_1 = db_exe(sql, 'add')
            if db_exe_1 == '失败':
                print('添加用户失败,跳出本次添加!')
                continue
            print('新用户添加成功,进入页面分析')


def judge_ins():
    # For every tracked account decide between a first-time import ('a') and an update ('u').
    sql = 'select ins_number from ins_index'
    results = db_exe(sql, 'find')
    print('一共' + str(len(results)) + '位用户')
    for result in results:
        sql = ("select * from ins_mes where ins_num='%s'" % (result[0]))
        db_exe_1 = db_exe(sql, 'find')
        if len(db_exe_1) > 0:
            print('用户:' + result[0] + '执行更新操作')
            ins_update_1 = ins_add(result[0], 'u')
            if ins_update_1 == '失败':
                print('用户' + result[0] + '执行更新操作失败,跳过该用户')
                continue
            if ins_update_1 == '更新完成':
                print(result[0] + '用户更新完成')
                continue
        else:
            print('用户:' + result[0] + '执行录入操作')
            ins_add_1 = ins_add(result[0], 'a')
            if ins_add_1 == '失败':
                print('用户' + result[0] + '执行录入操作失败,跳过该用户')
                continue


def ins_add(add_ins, aoru):
    # Crawl one account's post list and store new post codes in ins_mes.
    # aoru == 'a' imports everything; aoru == 'u' stops at the first post already stored.
    num = 1
    num_1 = 1
    while True:
        if num_1 >= 5:
            print('用户网页打不开,跳过' + add_ins)
            return '失败'
        url = veryins_url + '/' + add_ins
        print(url)
        soup = soup_bs(url)
        if soup == '失败':
            print('分析' + add_ins + '失败,跳过该用户')
            return '失败'
        try:
            items = soup.findAll(attrs={'class': "item"})
            # the post count is the text before '帖子' in the counter element
            num_item = soup.findAll(attrs={'class': "count"})[0].get_text().split('帖子')[0]
            print(num_item)
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            for item in items:
                img_wrap = item.find(class_="img-wrap")
                data_code = img_wrap.get('data-code')
                img_p_link = str(r'https://www.veryins.com/p/' + data_code)
                sql = ("select * from ins_mes where ins_code ='%s'" % data_code)
                db_exe_1 = db_exe(sql, 'find')
                if len(db_exe_1) == 0:
                    sql = ("insert into ins_mes (ins_num,ins_code,ins_p_link,time_add,time_update) values('%s','%s','%s','%s','%s')" % (add_ins, data_code, img_p_link, now_time, now_time))
                    db_exe_1 = db_exe(sql, 'add')
                    if db_exe_1 != '失败':
                        print('已写入数据库' + str(num) + '条')
                        num += 1
                else:
                    if aoru == 'u':
                        return '更新完成'
            break
        except Exception as e:
            print('写入前12条数据出现错误,错误原因:' + str(e))
            print('正在重试第' + str(num_1) + '次')
            time.sleep(2)
            num_1 += 1
    # the profile page only shows the first batch; POST for the remaining pages (cursor-based pagination)
    try:
        uid_class_1 = soup.findAll(attrs={'class': "row"})[0]
        uid_class = uid_class_1.findAll('div')[0].get('class')[0].lower()
        uid_num = uid_class_1.findAll('div')[0].get(uid_class)
        next_cursor = soup.find(class_='list').get('next-cursor')
    except Exception as e:
        print('获取当前博主的post数据失败,跳过该博主,错误原因:' + str(e))
        return '失败'
    while True:
        post_mes = str(r'https://www.veryins.com/user/post?next=' + next_cursor + r'&uid=' + uid_num)
        print(post_mes)
        while True:
            try:
                res1 = json.loads(requests.post(url=post_mes, headers=headers, timeout=10).text)
                break
            except:
                print('post失败,正在重试!')
        for k in res1['nodes']:
            data_code = k['code']
            img_p_link = str(r'https://www.veryins.com/p/' + data_code)
            sql = ("select * from ins_mes where ins_code ='%s'" % data_code)
            db_exe_1 = db_exe(sql, 'find')
            if len(db_exe_1) == 0:
                sql = ("insert into ins_mes (ins_num,ins_code,ins_p_link,time_add,time_update) values('%s','%s','%s','%s','%s')" % (add_ins, data_code, img_p_link, now_time, now_time))
                db_exe_1 = db_exe(sql, 'add')
                if db_exe_1 != '失败':
                    print('已写入数据库' + str(num) + '条')
                    num += 1
            else:
                if aoru == 'u':
                    return '更新完成'
                print('该条数据已存在,跳过')
        if str(res1['page_info']['has_next_page']) == 'True':
            next_cursor = res1['page_info']['end_cursor']
            time.sleep(3)
        else:
            break


def ins_info():
    # For every post without stored picture links, open the post page and save
    # its image/video links, comments and caption.
    sql = ('select ins_mes.ins_code from ins_mes left join all_pic_link on ins_mes.ins_code=all_pic_link.ins_code where all_pic_link.ins_code is null')
    results_code = db_exe(sql, 'find')
    print('获得所有未处理code')
    print(str(len(results_code)))
    for result_code in results_code:
        err = 1
        print('开始读取数据组建链接')
        url_2 = 'https://www.veryins.com/p/' + result_code[0]
        print(url_2)
        num = 1
        print('获取网页成功,正在分析图片地址')
        while True:
            try:
                soup = soup_bs(url_2)
                swiper_slide = soup.findAll(class_='swiper-slide')
                if len(swiper_slide) == 0:
                    # single-media post: one <img> or one <source> inside .imgwrapper
                    try:
                        img_wrapper = soup.find(class_='imgwrapper').find('img').attrs['src'].replace('amp', '')
                        sql = ("insert into all_pic_link (ins_code,ins_pic_link) values('%s','%s')" % (result_code[0], img_wrapper))
                        db_exe_1 = db_exe(sql, 'add')
                        if db_exe_1 == '失败':
                            print('写入失败')
                        else:
                            print('已写入数据库第' + str(num) + '张')
                            num += 1
                    except:
                        video_wrapper = soup.find(class_='imgwrapper').find('source').attrs['src'].replace('amp', '')
                        sql = ("insert into all_pic_link (ins_code,ins_pic_link) values('%s','%s')" % (result_code[0], video_wrapper))
                        db_exe_1 = db_exe(sql, 'add')
                        if db_exe_1 == '失败':
                            print('写入失败')
                        else:
                            print('已写入数据库第' + str(num) + '部')
                            num += 1
                else:
                    # carousel post: one slide per picture or video
                    for i in swiper_slide:
                        try:
                            img_link = i.find('img').attrs['src'].replace('amp', '')
                            sql = ("insert into all_pic_link (ins_code,ins_pic_link) values('%s','%s')" % (result_code[0], img_link))
                            db_exe_1 = db_exe(sql, 'add')
                            if db_exe_1 == '失败':
                                print('写入失败')
                            else:
                                print('已写入数据库第' + str(num) + '张')
                                num += 1
                        except:
                            video_wrapper = i.find('source').attrs['src'].replace('amp', '')
                            try:
                                sql = ("insert into all_pic_link (ins_code,ins_pic_link) values('%s','%s')" % (result_code[0], video_wrapper))
                                db_exe_1 = db_exe(sql, 'add')
                                if db_exe_1 == '失败':
                                    print('写入失败')
                                else:
                                    print('已写入数据库第' + str(num) + '部')
                                    num += 1
                            except:
                                print('出错,回滚2')
                                time.sleep(2)
                                continue
                dele = 1
                break
            except:
                print('出错重试!')
                time.sleep(2)
                print('获取网页失败,正在重试第:' + str(err) + '次')
                err += 1
                if err > 5:
                    print(result_code[0] + '的网址分析失败次数超过五次,删除该链接')
                    sql = ("delete from ins_mes where ins_code='%s'" % (result_code[0]))
                    db_exe(sql, 'add')
                    dele = 0
                    break
        if dele == 0:
            continue
        # store the comments of this post
        comments_link = soup.findAll(class_='comment-txt')
        for i in comments_link:
            herf_txt = i.find('a').get_text()
            comments_txt = pymysql.escape_string(i.find('p').get_text())
            try:
                sql = """insert into all_comments (ins_code,ins_commenter,comments) values('%s','%s','%s')""" % (result_code[0], herf_txt, comments_txt)
                print(sql)
                db_exe_1 = db_exe(sql, 'add')
                if db_exe_1 == '失败':
                    print('写入失败')
                else:
                    print('将评论写入数据库')
            except:
                print('出错,回滚3')
                time.sleep(2)
        # store the caption of this post
        article = pymysql.escape_string(soup.find(class_='caption').get_text())
        try:
            sql = "insert into all_articles (ins_code,articles) values('%s','%s')" % (result_code[0], article)
            print(sql)
            db_exe_1 = db_exe(sql, 'add')
            if db_exe_1 == '失败':
                print('写入失败')
            else:
                print('将帖子内容写入数据库')
        except:
            print('出错,回滚4')
            time.sleep(2)


if __name__ == "__main__":
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    veryins_url = 'https://www.veryins.com'
    db = pymysql.connect(host='localhost', user='root', passwd='toor', db='veryins')
    cur = db.cursor()
    ins_long = cur.execute('select * from ins_index')
    add_user_1 = add_user()
    if add_user_1 == '失败':
        print('查询当前用户失败,跳出本次查询')
    judge_ins_1 = judge_ins()
    if judge_ins_1 == '失败':
        print('获取用户图片失败,跳出本次爬取')
    ins_info()
    print('进程执行完毕')
    # print a per-account summary of stored posts and media links
    sql = ("select ins_number from ins_index")
    results = db_exe(sql, 'find')
    print('共有' + str(len(results)) + '位博主')
    for result in results:
        sql = ("select * from ins_mes where ins_num='%s'" % result[0])
        db_exe_1 = db_exe(sql, 'find')
        sql = ("select all_pic_link.id from ins_mes inner join all_pic_link on ins_mes.ins_num='%s' and ins_mes.ins_code=all_pic_link.ins_code" % result[0])
        db_exe_2 = db_exe(sql, 'find')
        print(result[0] + '博主共有' + str(len(db_exe_1)) + '篇帖子和' + str(len(db_exe_2)) + '张照片和视频')
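The script does not create its own tables; it assumes a local MySQL database named veryins with the five tables referenced above already in place (ins_index, ins_mes, all_pic_link, all_comments, all_articles). Below is a minimal sketch of that schema. The table and column names are taken from the SQL in the script; the column types and lengths are my own guesses and may need adjusting.

# schema_sketch.py -- a minimal sketch of the tables the crawler expects.
# Table and column names come from the SQL in the script above;
# the column types and lengths are assumptions.
import pymysql

DDL = [
    """create table if not exists ins_index (
           ins_number  varchar(64) primary key,
           ins_name    varchar(255),
           time_add    datetime,
           time_update datetime
       )""",
    """create table if not exists ins_mes (
           ins_num     varchar(64),
           ins_code    varchar(64),
           ins_p_link  varchar(255),
           time_add    datetime,
           time_update datetime
       )""",
    """create table if not exists all_pic_link (
           id           int auto_increment primary key,
           ins_code     varchar(64),
           ins_pic_link text
       )""",
    """create table if not exists all_comments (
           ins_code      varchar(64),
           ins_commenter varchar(255),
           comments      text
       )""",
    """create table if not exists all_articles (
           ins_code varchar(64),
           articles text
       )""",
]

if __name__ == '__main__':
    db = pymysql.connect(host='localhost', user='root', passwd='toor', db='veryins')
    cur = db.cursor()
    for ddl in DDL:
        cur.execute(ddl)
    db.commit()
    db.close()

With the tables in place, an account is tracked simply by inserting its ins_number into ins_index; add_user then fills in the display name on the next run.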

The content on this little site is excellent, but the layout isn't to my taste. I once asked the site owner whether the bloggers I follow could be merged into one feed, like scrolling Instagram, so that logging in to my own account would show only the bloggers I like, but the owner never replied. So I wrote this bit of code to crawl the posts and all the photo links of my favorite bloggers and store them in a database, with the plan of building a small interface just for my own viewing. If the site owner sees this post and objects in any way, I will take it down immediately. Thank you.
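As a very rough sketch of that planned interface, a first version could do no more than query the tables the crawler fills. Everything below beyond the table and column names (the function name latest_media, the ordering, the limit, the placeholder account) is my own assumption and not part of the crawler above.

# feed_sketch.py -- a rough idea of the "personal feed" query, not part of the crawler.
import pymysql

def latest_media(ins_number, limit=50):
    # Return the newest stored picture/video links for one tracked account.
    db = pymysql.connect(host='localhost', user='root', passwd='toor', db='veryins')
    cur = db.cursor()
    cur.execute(
        "select all_pic_link.ins_pic_link, ins_mes.time_add "
        "from ins_mes inner join all_pic_link on ins_mes.ins_code = all_pic_link.ins_code "
        "where ins_mes.ins_num = %s order by ins_mes.time_add desc limit %s",
        (ins_number, limit))
    rows = cur.fetchall()
    db.close()
    return rows

if __name__ == '__main__':
    for link, added in latest_media('some_ins_number'):  # placeholder account number
        print(added, link)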

Since I'm still a beginner, the structure of this Python crawler is certainly not up to standard, and many parts are not well written. Corrections are welcome so I can keep improving. Thank you!
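One concrete suggestion along those lines: every SQL statement in the script is assembled with % string formatting, so a caption or comment containing a quote can still break an INSERT even after pymysql.escape_string. Below is a hedged sketch of what db_exe could look like if it passed parameters to cursor.execute instead; it keeps the five-attempt retry and the '失败' sentinel from the script, but it is a suggestion, not the version used above.

# A sketch of a parameterized variant of db_exe; the '失败' sentinel and the
# five-attempt retry mirror the script above, the rest is an assumption.
import time
import pymysql

def db_exe_params(db, sql, params=None, judge='find', retries=5):
    cur = db.cursor()
    for attempt in range(1, retries + 1):
        try:
            cur.execute(sql, params)   # pymysql quotes and escapes the parameters itself
            if judge == 'find':
                return cur.fetchall()
            db.commit()
            return 'ok'
        except Exception as e:
            db.rollback()
            print('attempt ' + str(attempt) + ' failed: ' + str(e))
            time.sleep(2)
    return '失败'

# Example use, replacing the string-formatted INSERT for comments:
# db_exe_params(db,
#               "insert into all_comments (ins_code,ins_commenter,comments) values(%s,%s,%s)",
#               (result_code[0], herf_txt, comments_txt), judge='add')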
