Python_爬虫_案例汇总:
1.豆瓣采集
# coding:utf-8
# Scrape Douban book info and cover images for one tag page, store rows in MySQL.
from urllib import request
# from bs4 import BeautifulSoup
from lxml import etree
import json, pymysql

# from my_pymysql import pymysql

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    'Host': 'book.douban.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
req = request.Request(url=url, headers=headers, method="GET")
content = request.urlopen(req).read().decode("utf-8")
content_dict = etree.HTML(content)  # parse into an lxml element tree
# print(content_dict)
content_dict_allli = content_dict.xpath(r'//*[@id="subject_list"]/ul/li')  # one <li> per book
info_all = ''

# Open the connection once — the original reconnected inside the loop on every
# book and only ever closed the last connection.
db = pymysql.connect(host='localhost', port=3306, user="root", password='root',
                     db='douban', charset='utf8')
try:
    cur = db.cursor()
    for li in content_dict_allli:
        # Title — note the XPath is relative to the <li> (unlike bs4).
        title = li.xpath(r'div[2]/h2/a/@title')[0]
        title = title.replace(" ", '')
        print(title)
        # Author and publisher share one slash-separated text node.
        info_list = li.xpath(r'div[2]/div[1]/text()')
        author = info_list[0].split('/')[0]
        author = author.replace('\n', '').replace(" ", '')
        chubanshe = info_list[0].split('/')[1]
        print(author)
        print(chubanshe)
        # Rating
        pingfen = li.xpath(r'div[2]/div[2]/span[2]/text()')[0]
        print(pingfen)

        # Cover image: download and save as douban/<title>.jpg
        img_net_addr = li.xpath(r'div[1]/a/img/@src')[0]
        print(img_net_addr)
        data = request.urlopen(img_net_addr).read()
        img_name = 'douban/' + title + '.jpg'
        with open(img_name, 'wb') as f:
            f.write(data)

        # Parameterized INSERT — the original interpolated scraped text with %,
        # which breaks on quotes in titles and is SQL-injectable.
        sql = "insert into douban(title,author,chubanshe,pingfen)values(%s,%s,%s,%s)"
        cur.execute(sql, (title, author, chubanshe, pingfen))
        db.commit()
finally:
    db.close()
采集豆瓣书籍信息和图片:带请求头,图片存本地,书籍信息写入数据库
2.链家
# coding:utf-8
# Scrape Lianjia (Xi'an) second-hand-house listings: titles/addresses go into
# MySQL, cover images onto disk. Parsed with bs4.
from urllib import request, error
from bs4 import BeautifulSoup
import pymysql
# from my_pymysql import pymysql

# One DB connection for the whole run.
db = pymysql.connect(host='localhost', user='root', password='root', db='lianjia', charset='utf8')
cur = db.cursor()  # cursor instance

for i in range(1, 33):  # 32 result pages
    req = request.urlopen('https://xa.lianjia.com/ershoufang/pg' + str(i)).read().decode('utf-8')
    req_bs4 = BeautifulSoup(req, 'html.parser')  # bs4 object needed for bs4 lookups
    body_ul = req_bs4.find('ul', class_="sellListContent")
    try:
        s = ''
        for li in body_ul:
            # info_all = li.find('div',class_="info clear").get_text()  # everything at once
            tit = li.find('div', class_="title").get_text()        # listing title
            addr = li.find('div', class_="houseInfo").get_text()   # address / house info
            pric = li.find('div', class_="totalPrice").get_text()  # total price
            s += tit
            s += addr
            s += pric
            s += '\n\n'
            print(i)  # progress indicator: current page number

            # ---- image download ++++++++++++++++++++++++++++++++++++++++++++
            img = li.find("img", class_='lj-lazy')['data-original']  # real URL of lazy-loaded image
            img_format = img.split('.')[-1]  # extension = text after the last dot
            img_name = 'lianjia/images/' + li.find("img", class_='lj-lazy')['alt'] + '.' + img_format
            adr = request.urlopen(img).read()  # raw image bytes
            try:  # skip images that cannot be written (bad filename, missing dir, ...)
                with open(img_name, 'wb') as f:
                    f.write(adr)
            except OSError:
                pass
            # ---- image download done ----------------------------

            # Parameterized INSERT — the original used % string interpolation,
            # which breaks on quotes in scraped text and is SQL-injectable.
            sql = "insert into lianjia_hotel(title,address) values (%s,%s)"
            cur.execute(sql, (tit, addr))
            db.commit()
    except Exception:
        # NOTE(review): the original used a bare except as loop control — any <li>
        # without the expected divs (ads, separators) ends the current page.
        print("本页完毕~")

# finally close the database
db.close()

# write to a txt file instead:
# with open('lianjia/lianjia.txt','w',encoding="utf-8")as f:
#     f.write(s)
链家下载,文字与图片,用bs4解析
3.今日头条
# Scrape headlines from Toutiao's "科技" (tech) channel: selenium scrolls the
# lazy-loading feed, then lxml/pyquery extract the titles into toutiao.txt.
from selenium import webdriver
from lxml import etree
from pyquery import PyQuery as pq
import time

driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get('https://www.toutiao.com/')
    driver.implicitly_wait(10)
    driver.find_element_by_link_text('科技').click()
    driver.implicitly_wait(10)

    # Scroll down in steps so the infinite feed loads more items.
    for x in range(3):
        js = "var q=document.documentElement.scrollTop=" + str(x * 500)
        driver.execute_script(js)
        time.sleep(2)
    time.sleep(5)  # give the last batch time to render

    page = driver.page_source
    doc = pq(page)
    doc = etree.HTML(str(doc))
    contents = doc.xpath('//div[@class="wcommonFeed"]/ul/li')
    print(contents)
    for x in contents:
        title = x.xpath('div/div[1]/div/div[1]/a/text()')
        if title:  # some <li> are ads/placeholders with no title link
            title = title[0]
            with open('toutiao.txt', 'a+', encoding='utf8') as f:
                f.write(title + '\n')
            print(title)
finally:
    # The original never quit the driver, leaking the browser process.
    driver.quit()
今日头条,selenium控制翻页
4.微信群信息(包括成员)和联系人
# -*- coding:utf-8 -*-
'''
After scanning the QR code to log in to WeChat Web, fetch this account's group
chats (including members) and address-book contacts. [Note: may be incomplete.]
'''
import os
import re
import time
import sys
import subprocess
import requests
import xml.dom.minidom
import json


# WeChat Web login + contact fetching.
class WebwxLogin(object):
    def __init__(self):
        # One session so cookies persist across the login handshake.
        self.session = requests.session()
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'}
        # QR image is written next to this script.
        self.QRImgPath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'webWeixinQr.jpg'
        self.uuid = ''
        self.tip = 0
        self.base_uri = ''
        self.redirect_uri = ''
        # Login credentials extracted from the redirect response.
        self.skey = ''
        self.wxsid = ''
        self.wxuin = ''
        self.pass_ticket = ''
        self.deviceId = 'e000000000000000'
        self.BaseRequest = {}
        self.ContactList = []
        self.My = []
        self.SyncKey = ''

    def getUUID(self):
        """Request a login uuid from the jslogin endpoint; True on code 200."""
        url = 'https://login.weixin.qq.com/jslogin'
        params = {
            'appid': 'wx782c26e4c19acffb',
            'redirect_uri': 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage',
            'fun': 'new',
            'lang': 'zh_CN',
            '_': int(time.time() * 1000),  # timestamp cache-buster
        }
        response = self.session.get(url, params=params)
        target = response.content.decode('utf-8')
        pattern = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
        ob = re.search(pattern, target)  # extract code and uuid via regex
        code = ob.group(1)
        self.uuid = ob.group(2)
        if code == '200':  # request succeeded
            return True
        return False

    def showQRImage(self):
        """Download the login QR code and open it with the OS image viewer."""
        url = 'https://login.weixin.qq.com/qrcode/' + self.uuid
        response = self.session.get(url)
        self.tip = 1
        with open(self.QRImgPath, 'wb') as f:
            f.write(response.content)
            # (the original also called f.close() here — redundant inside `with`)

        # Open the QR code with the platform's default opener.
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['open', self.QRImgPath])      # macOS
        elif sys.platform.find('linux') >= 0:
            subprocess.call(['xdg-open', self.QRImgPath])  # Linux
        else:
            os.startfile(self.QRImgPath)                   # Windows
        print('请使用微信扫描二维码登录')

    def checkLogin(self):
        """Poll the login endpoint once; return the status code string.

        201 = QR scanned, 200 = confirmed on phone (stores redirect_uri),
        408 = poll timed out (caller should just poll again).
        """
        url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
            self.tip, self.uuid, int(time.time() * 1000))
        response = self.session.get(url)
        target = response.content.decode('utf-8')
        pattern = r'window.code=(\d+);'
        ob = re.search(pattern, target)
        code = ob.group(1)

        if code == '201':  # scanned
            print('成功扫描,请在手机上点击确认登录')
            self.tip = 0
        elif code == '200':  # logged in
            print('正在登录中...')
            regx = r'window.redirect_uri="(\S+?)";'
            ob = re.search(regx, target)
            self.redirect_uri = ob.group(1) + '&fun=new'
            self.base_uri = self.redirect_uri[:self.redirect_uri.rfind('/')]
        elif code == '408':  # timeout
            pass
        return code

    def login(self):
        """Follow redirect_uri and parse skey/wxsid/wxuin/pass_ticket from XML."""
        response = self.session.get(self.redirect_uri, verify=False)
        data = response.content.decode('utf-8')
        doc = xml.dom.minidom.parseString(data)
        root = doc.documentElement

        # Extract the credential parameters from the response.
        for node in root.childNodes:
            if node.nodeName == 'skey':
                self.skey = node.childNodes[0].data
            elif node.nodeName == 'wxsid':
                self.wxsid = node.childNodes[0].data
            elif node.nodeName == 'wxuin':
                self.wxuin = node.childNodes[0].data
            elif node.nodeName == 'pass_ticket':
                self.pass_ticket = node.childNodes[0].data

        if not all((self.skey, self.wxsid, self.wxuin, self.pass_ticket)):
            return False

        self.BaseRequest = {
            'Uin': int(self.wxuin),
            'Sid': self.wxsid,
            'Skey': self.skey,
            'DeviceID': self.deviceId,
        }
        return True

    def webwxinit(self):
        """Initialise the web session; fills ContactList, My and SyncKey."""
        url = self.base_uri + \
            '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
                self.pass_ticket, self.skey, int(time.time() * 1000))
        params = {'BaseRequest': self.BaseRequest}
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.post(url, data=json.dumps(params), headers=h, verify=False)
        data = response.content.decode('utf-8')
        print(data)

        dic = json.loads(data)
        self.ContactList = dic['ContactList']
        self.My = dic['User']
        # SyncKey is serialised as "key_val|key_val|..." for later sync calls.
        SyncKeyList = []
        for item in dic['SyncKey']['List']:
            SyncKeyList.append('%s_%s' % (item['Key'], item['Val']))
        self.SyncKey = '|'.join(SyncKeyList)

        ErrMsg = dic['BaseResponse']['ErrMsg']
        Ret = dic['BaseResponse']['Ret']
        if Ret != 0:
            return False
        return True

    def webwxgetcontact(self):
        """Fetch the full contact list and strip non-friend entries."""
        url = self.base_uri + \
            '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
                self.pass_ticket, self.skey, int(time.time()))
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.get(url, headers=h, verify=False)
        data = response.content.decode('utf-8')
        # print(data)

        dic = json.loads(data)
        MemberList = dic['MemberList']

        # Built-in service accounts that are not real friends.
        SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                        "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp",
                        "facebookapp", "masssendapp",
                        "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder",
                        "weixinreminder", "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts",
                        "notification_messages", "wxitil", "userexperience_alarm"]

        # Iterate backwards — removing while iterating forwards would skip items.
        for i in range(len(MemberList) - 1, -1, -1):
            Member = MemberList[i]
            if Member['VerifyFlag'] & 8 != 0:  # official/service account
                MemberList.remove(Member)
            elif Member['UserName'] in SpecialUsers:  # special account
                MemberList.remove(Member)
            elif Member['UserName'].find('@@') != -1:  # group chat
                MemberList.remove(Member)
            elif Member['UserName'] == self.My['UserName']:  # myself
                MemberList.remove(Member)
        return MemberList

    def main(self):
        """Run the full flow: QR login, init, then print every contact."""
        if not self.getUUID():
            print('获取uuid失败')
            return

        self.showQRImage()
        time.sleep(1)

        # Poll until the user confirms login on the phone (endpoint long-polls).
        while self.checkLogin() != '200':
            pass

        os.remove(self.QRImgPath)

        if not self.login():
            print('登录失败')
            return
        # Logged in — now query friends.
        if not self.webwxinit():
            print('初始化失败')
            return

        MemberList = self.webwxgetcontact()
        print('通讯录共%s位好友' % len(MemberList))

        for x in MemberList:
            sex = '未知' if x['Sex'] == 0 else '男' if x['Sex'] == 1 else '女'
            print('昵称:%s, 性别:%s, 备注:%s, 签名:%s' % (x['NickName'], sex, x['RemarkName'], x['Signature']))


if __name__ == '__main__':
    print('开始')
    wx = WebwxLogin()
    wx.main()
爬取微信群信息(包括成员)和联系人信息
5.爬取淘宝固定类别商品信息+保存到mysql数据库【格式很规范】
# Scrape Taobao search results for a fixed keyword and save name/price to MySQL.
import requests
import re
import json
import pymysql


def getHTMLtext(url):
    """Fetch url and return its decoded text, or '' on any request error."""
    try:
        r = requests.get(url, timeout=100)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def getpage(itl, html):
    """Append [price, title] pairs parsed from the page's embedded JSON to itl."""
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        nlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            # The matched fragments are JSON string literals (e.g. '"30.50"');
            # json.loads decodes them safely — the original used eval() on
            # scraped (untrusted) text, which can execute arbitrary code.
            price = json.loads(plt[i].split(':')[1])
            title = json.loads(nlt[i].split(':')[1])
            itl.append([price, title])
    except (IndexError, ValueError):
        print("")


def printgoods(itl):
    """Print the goods table and persist every row into company.coffee."""
    tplt = "{:2}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    count = 0
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           db='company', charset="utf8")
    cur = conn.cursor()
    sqlc = '''create table coffee(
    id int(11) not null auto_increment primary key,
    name varchar(255) not null,
    price float not null)DEFAULT CHARSET=utf8;'''
    try:
        cur.execute(sqlc)
        conn.commit()
        print('成功')
    except pymysql.MySQLError:  # table most likely exists already
        print("错误")
    for g in itl:
        count = count + 1
        # The original formatted this row into `b` but never output it.
        print(tplt.format(count, g[0], g[1]))
        # Parameterized INSERT, already safe in the original.
        sqla = '''insert into coffee(name,price)values(%s,%s);'''
        try:
            cur.execute(sqla, (g[1], g[0]))
            conn.commit()
            print('成功')
        except pymysql.MySQLError:
            print("错误")
        # save_path = 'D:/taobao.txt'
        # f=open(save_path,'a')
        # f.write(b+'\n')
        # f.close()
    conn.commit()
    cur.close()
    conn.close()


def main():
    goods = "咖啡"  # search keyword
    depth = 2       # number of result pages (44 items per page)
    start_url = 'https://s.taobao.com/search?q=' + goods
    List = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(i * 44)
            html = getHTMLtext(url)
            getpage(List, html)
        except Exception:
            continue
    # printgoods returns None, so the original's print(printgoods(List))
    # printed a stray "None" after the table.
    printgoods(List)
    # savefiles(data)


if __name__ == '__main__':
    main()
淘宝信息采集+保存到Mysql数据库
转载于:https://www.cnblogs.com/hellangels333/p/8621368.html
Python_爬虫_案例汇总:相关推荐
- Python爬虫_案例分析(二)
Python爬虫_案例分析(二) 一.电影天堂案例 import scrapy from scrapy_movie.items import ScrapyMovieItem class MvSpide ...
- Python_爬虫_网页图片下载_その日の紋
Python_爬虫_网页图片下载_その日の紋 项目效果 项目需求 项目分析 URL分析 页面分析 项目实施 项目源码 项目效果 项目需求 目标页面:https://www.hanakomon.jp/c ...
- Python_爬虫_猫眼电影网电影预告片批量下载
非常简单的一个基础爬虫代码,可以根据不同的url自动下载同一页中的所有预告片 import requests from lxml import etree import re# 1.确定url地址 u ...
- python_爬虫_七麦网
本文用于学习交流使用,如有侵权,联系删除 1 爬取需求 1.1 七麦网简介 七麦网(https://www.qimai.cn/),该平台支持提供iOS.Android应用市场.微信.小程序等数据查询, ...
- python_爬虫_豆瓣TOP250_url
本文仅供学习使用,如有侵权,联系删除. 获得豆瓣top 250书单的url import lxml import requests import re import csv from requests ...
- Python_爬虫_中文乱码
今天在用Python2.7爬取百度百科的一个网页时发现输出时中文为乱码. 尝试一: 查看网页页面信息,发现其中文字编码为"GBK",遂准备对其进行解码. content = url ...
- python_爬虫_豆瓣TOP250_页面内容
本文仅供学习使用,如有侵权,联系删除 豆瓣TOP250书籍页面内容如下,此次将爬取图片中的内容 from bs4 import BeautifulSoup import lxml import req ...
- Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例
Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例 文章目录 Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例 Seaborn 学习目标 6.1 Se ...
- Python爬虫_某宝网案例
Python爬虫_某宝网案例 一.导入第三方库,确定url,定义headers ,伪装爬虫代码 import requests url = 'https://s.taobao.com/search?q ...
最新文章
- Linux命令之du
- HenCoder Android UI 部分 2-3 定制 Layout 的内部布局
- Table 'xxxxx' is marked as crashed and last 解决办法
- __attribute__机制介绍
- ftp 速度_如何评价我的骑行功率(FTP)?
- 解决PRINT函数UTF-8问题
- FPGA定点数字信号处理技巧(一)
- 土地土壤数据下载网站整理
- 计算机主机装机注意,自己组装电脑要注意什么?DIY老司机教你装机注意事项 (全文)...
- 中国新能源汽车产业销售模式与十四五竞争格局展望报告2022版
- Tecplot新手进阶--使用tecplot宏操作批量处理数据输出图片(详细步骤)
- java 数组写入txt_java编程,如何把一个数组中的内容写入一个txt文档中
- 二分图匹配 Hopcroft-Carp (HK) 算法详解 附例题
- 目前为止最全的微信小程序项目实例 --- demo锦集
- 如何消除win10文件右上角的蓝色箭头
- 基于Mui与H5+开发webapp的Android原生工程打包步骤(使用新版本5+SDK与Android studio)(部分内容转自dcloud官网)...
- Mysql-可重复读的隔离级别在什么情况下会出现幻读
- Zabbix unreachable poller processes more than 75% busy
- 华为无线ap由fit升级到fat的方法
- 中国移动MobileMarket重点支持机型信息-12月24日
热门文章
- 第37课 神奇的大自然 《小学生C++趣味编程》
- 计算机硬软件故障实训报告,计算机维护维修实训报告.docx
- html怎么给变量添加样式,通过CSS变量修改样式
- 信息安全工程师笔记-网络攻击常见技术
- Python笔记-Can’t reconnect until invalid transaction is rolled back
- Java笔记-Spring Boot SSL(https)实例
- Linux笔记-centos7替换yum及编译安装mydumper
- flash绘制荷花多个图层_Flash鼠绘入门第八课:绘制脱俗荷花
- python爬虫下载模块_python爬虫系列(4.5-使用urllib模块方式下载图片)
- linux比较小数大小,带有小数点的数值对比大小