1. Douban scraper

#coding:utf-8
#Scrape Douban book info and cover images, then write them to a database

from urllib import request
from lxml import etree
import pymysql

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    'Host': 'book.douban.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
req = request.Request(url=url, headers=headers, method="GET")
content = request.urlopen(req).read().decode("utf-8")
content_dict = etree.HTML(content)    # parse the page into an element tree
content_dict_allli = content_dict.xpath(r'//*[@id="subject_list"]/ul/li')    # list of book entries

# connect once, before the loop, and close once at the end
db = pymysql.connect(host='localhost', port=3306, user="root", password='root', db='douban', charset='utf8')
cur = db.cursor()

for li in content_dict_allli:
    # book title (note: unlike bs4, these XPaths are relative to the current li)
    title = li.xpath(r'div[2]/h2/a/@title')[0].replace(" ", '')
    print(title)
    # author and publisher
    info_list = li.xpath(r'div[2]/div[1]/text()')
    author = info_list[0].split('/')[0].replace('\n', '').replace(" ", '')
    chubanshe = info_list[0].split('/')[1]    # publisher
    print(author)
    print(chubanshe)
    # rating
    pingfen = li.xpath(r'div[2]/div[2]/span[2]/text()')[0]
    print(pingfen)

    # cover image
    img_net_addr = li.xpath(r'div[1]/a/img/@src')[0]
    print(img_net_addr)
    data = request.urlopen(img_net_addr).read()
    img_name = 'douban/' + title + '.jpg'
    with open(img_name, 'wb') as f:
        f.write(data)

    # database insert, parameterized so quotes in a title cannot break the SQL
    sql = "insert into douban(title,author,chubanshe,pingfen) values (%s,%s,%s,%s)"
    cur.execute(sql, (title, author, chubanshe, pingfen))
    db.commit()

db.close()

Scrapes Douban book info and cover images: sends request headers, writes records to the database, downloads the images.
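
The insert above assumes a douban table already exists. A minimal one-time setup sketch — the column names come from the insert statement, but the column types are assumptions, not from the original post:

# hypothetical schema; only the column names are taken from the insert above
import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='root', db='douban', charset='utf8')
cur = db.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS douban (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        author VARCHAR(255),
        chubanshe VARCHAR(255),   -- publisher
        pingfen VARCHAR(16)       -- rating, stored as text like the scraper does
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()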

2. Lianjia

#coding:utf-8
#Done: pull Lianjia listing data and save it
from urllib import request
from bs4 import BeautifulSoup
import pymysql
# connect to the database
db = pymysql.connect(host='localhost',user='root',password='root',db='lianjia',charset='utf8')
cur = db.cursor()   # instantiate a cursor

for i in range(1, 33):
    req = request.urlopen('https://xa.lianjia.com/ershoufang/pg' + str(i)).read().decode('utf-8')
    req_bs4 = BeautifulSoup(req, 'html.parser')  # build the object so bs4 can be used
    body_ul = req_bs4.find('ul', class_="sellListContent")
    try:
        s = ''
        for li in body_ul.find_all('li'):  # iterate the <li> tags only, skipping bare text nodes
            # info_all = li.find('div',class_="info clear").get_text()    # all info at once
            tit = li.find('div', class_="title").get_text()       # title
            addr = li.find('div', class_="houseInfo").get_text()  # address
            pric = li.find('div', class_="totalPrice").get_text() # price
            s += tit
            s += addr
            s += pric
            s += '\n\n'
            print(i)    # show which page is being scraped
            # image scraping starts ++++++++++++++++++++++++++++++++++++++++++++
            img = li.find("img", class_='lj-lazy')['data-original']  # image URL
            img_format = img.split('.')[-1]  # split on dots to get the file extension
            img_name = 'lianjia/images/' + li.find("img", class_='lj-lazy')['alt'] + '.' + img_format
            adr = request.urlopen(img).read()  # fetch the image as a byte stream, then write it out
            try:    # skip if empty
                with open(img_name, 'wb') as f:
                    f.write(adr)
            except:
                pass
            # image scraping done ----------------------------
            # save to the database, parameterized to be safe with quotes
            sql = "insert into lianjia_hotel(title,address) values (%s,%s)"
            cur.execute(sql, (tit, addr))
            db.commit()
    except:
        print("Done with this page~")

# finally, close the database connection
db.close()
# write everything to a txt file
# with open('lianjia/lianjia.txt','w',encoding="utf-8") as f:
#     f.write(s)

Lianjia download, text and images, parsed with bs4.
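
Both scripts above write into relative directories ('douban/', 'lianjia/images/') and fail with FileNotFoundError if those folders are missing. A small guard, assuming the scripts run from the project root (hypothetical helper, not in the original post):

# create the output folder trees if they are missing
import os

for d in ('douban', 'lianjia/images'):
    os.makedirs(d, exist_ok=True)  # no error if the folder already exists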

3. Jinri Toutiao

from selenium import webdriver
from lxml import etree
from pyquery import PyQuery as pq
import time

driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.toutiao.com/')
driver.implicitly_wait(10)
driver.find_element_by_link_text('科技').click()  # click the "科技" (Tech) tab
driver.implicitly_wait(10)
for x in range(3):
    # scroll down in steps so more feed items lazy-load
    js = "var q=document.documentElement.scrollTop=" + str(x * 500)
    driver.execute_script(js)
    time.sleep(2)
time.sleep(5)
page = driver.page_source
doc = pq(page)
doc = etree.HTML(str(doc))
contents = doc.xpath('//div[@class="wcommonFeed"]/ul/li')
print(contents)
for x in contents:
    title = x.xpath('div/div[1]/div/div[1]/a/text()')
    if title:
        title = title[0]
        with open('toutiao.txt', 'a+', encoding='utf8') as f:
            f.write(title + '\n')
        print(title)

Jinri Toutiao, with selenium driving the scrolling.
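
find_element_by_link_text works on the Selenium 3 used here but was removed in Selenium 4. A sketch of the equivalent call under Selenium 4, assuming the same page layout:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.toutiao.com/')
driver.implicitly_wait(10)
# Selenium 4 spelling of find_element_by_link_text('科技')
driver.find_element(By.LINK_TEXT, '科技').click()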

4. WeChat group info (including members) and contacts

# -*- coding:utf-8 -*-
'''
After logging in to WeChat by scanning a QR code, fetch that account's group chats
(including group members) and address-book contacts. [Note: the list may be incomplete.]
'''
import os
import re
import time
import sys
import subprocess
import requests
import xml.dom.minidom
import json

# WeChat login
class WebwxLogin(object):
    def __init__(self):
        self.session = requests.session()
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'}
        self.QRImgPath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'webWeixinQr.jpg'
        self.uuid = ''
        self.tip = 0
        self.base_uri = ''
        self.redirect_uri = ''
        self.skey = ''
        self.wxsid = ''
        self.wxuin = ''
        self.pass_ticket = ''
        self.deviceId = 'e000000000000000'
        self.BaseRequest = {}
        self.ContactList = []
        self.My = []
        self.SyncKey = ''

    def getUUID(self):
        url = 'https://login.weixin.qq.com/jslogin'
        params = {
            'appid': 'wx782c26e4c19acffb',
            'redirect_uri': 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage',
            'fun': 'new',
            'lang': 'zh_CN',
            '_': int(time.time() * 1000),  # timestamp in ms
        }
        response = self.session.get(url, params=params)
        target = response.content.decode('utf-8')
        pattern = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
        ob = re.search(pattern, target)  # extract the uuid with a regex
        code = ob.group(1)
        self.uuid = ob.group(2)
        if code == '200':  # did the request succeed?
            return True
        return False

    def showQRImage(self):
        url = 'https://login.weixin.qq.com/qrcode/' + self.uuid
        response = self.session.get(url)
        self.tip = 1
        with open(self.QRImgPath, 'wb') as f:
            f.write(response.content)
        # open the QR code image
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['open', self.QRImgPath])  # macOS
        elif sys.platform.find('linux') >= 0:
            subprocess.call(['xdg-open', self.QRImgPath])  # Linux
        else:
            os.startfile(self.QRImgPath)  # Windows
        print('Scan the QR code with WeChat to log in')

    def checkLogin(self):
        url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
            self.tip, self.uuid, int(time.time() * 1000))
        response = self.session.get(url)
        target = response.content.decode('utf-8')
        pattern = r'window.code=(\d+);'
        ob = re.search(pattern, target)
        code = ob.group(1)
        if code == '201':  # QR code scanned
            print('Scan succeeded; confirm the login on your phone')
            self.tip = 0
        elif code == '200':  # logged in
            print('Logging in...')
            regx = r'window.redirect_uri="(\S+?)";'
            ob = re.search(regx, target)
            self.redirect_uri = ob.group(1) + '&fun=new'
            self.base_uri = self.redirect_uri[:self.redirect_uri.rfind('/')]
        elif code == '408':  # timeout
            pass
        return code

    def login(self):
        response = self.session.get(self.redirect_uri, verify=False)
        data = response.content.decode('utf-8')
        doc = xml.dom.minidom.parseString(data)
        root = doc.documentElement
        # extract the session parameters from the response
        for node in root.childNodes:
            if node.nodeName == 'skey':
                self.skey = node.childNodes[0].data
            elif node.nodeName == 'wxsid':
                self.wxsid = node.childNodes[0].data
            elif node.nodeName == 'wxuin':
                self.wxuin = node.childNodes[0].data
            elif node.nodeName == 'pass_ticket':
                self.pass_ticket = node.childNodes[0].data
        if not all((self.skey, self.wxsid, self.wxuin, self.pass_ticket)):
            return False
        self.BaseRequest = {
            'Uin': int(self.wxuin),
            'Sid': self.wxsid,
            'Skey': self.skey,
            'DeviceID': self.deviceId,
        }
        return True

    def webwxinit(self):
        url = self.base_uri + \
            '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
                self.pass_ticket, self.skey, int(time.time() * 1000))
        params = {'BaseRequest': self.BaseRequest}
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.post(url, data=json.dumps(params), headers=h, verify=False)
        data = response.content.decode('utf-8')
        print(data)
        dic = json.loads(data)
        self.ContactList = dic['ContactList']
        self.My = dic['User']
        SyncKeyList = []
        for item in dic['SyncKey']['List']:
            SyncKeyList.append('%s_%s' % (item['Key'], item['Val']))
        self.SyncKey = '|'.join(SyncKeyList)
        ErrMsg = dic['BaseResponse']['ErrMsg']
        Ret = dic['BaseResponse']['Ret']
        if Ret != 0:
            return False
        return True

    def webwxgetcontact(self):
        url = self.base_uri + \
            '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
                self.pass_ticket, self.skey, int(time.time()))
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.get(url, headers=h, verify=False)
        data = response.content.decode('utf-8')
        # print(data)
        dic = json.loads(data)
        MemberList = dic['MemberList']
        # iterate in reverse, otherwise removing items skips entries
        SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                        "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp",
                        "facebookapp", "masssendapp", "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin",
                        "brandsessionholder", "weixinreminder", "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c",
                        "officialaccounts", "notification_messages", "wxitil", "userexperience_alarm"]
        for i in range(len(MemberList) - 1, -1, -1):
            Member = MemberList[i]
            if Member['VerifyFlag'] & 8 != 0:  # official/subscription accounts
                MemberList.remove(Member)
            elif Member['UserName'] in SpecialUsers:  # special built-in accounts
                MemberList.remove(Member)
            elif Member['UserName'].find('@@') != -1:  # group chats
                MemberList.remove(Member)
            elif Member['UserName'] == self.My['UserName']:  # the account itself
                MemberList.remove(Member)
        return MemberList

    def main(self):
        if not self.getUUID():
            print('Failed to get uuid')
            return
        self.showQRImage()
        time.sleep(1)
        while self.checkLogin() != '200':
            pass
        os.remove(self.QRImgPath)
        if not self.login():
            print('Login failed')
            return
        # login complete; now query contacts
        if not self.webwxinit():
            print('Initialization failed')
            return
        MemberList = self.webwxgetcontact()
        print('The address book has %s contacts' % len(MemberList))
        for x in MemberList:
            sex = 'unknown' if x['Sex'] == 0 else 'male' if x['Sex'] == 1 else 'female'
            print('Nickname: %s, sex: %s, remark: %s, signature: %s' % (
                x['NickName'], sex, x['RemarkName'], x['Signature']))


if __name__ == '__main__':
    print('Start')
    wx = WebwxLogin()
    wx.main()

Scrapes WeChat group info (including members) and contact info.
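
The session calls above pass verify=False, so each request logs an InsecureRequestWarning. A common way to silence it, assuming the urllib3 that ships with requests:

import urllib3

# suppress the warning that verify=False triggers on every request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)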

5. Scraping Taobao products in a fixed category + saving to a MySQL database [nicely structured]

import requests
import re
import pymysql

def getHTMLtext(url):
    try:
        r = requests.get(url, timeout=100)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def getpage(itl, html):
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        nlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
            title = eval(nlt[i].split(':')[1])
            itl.append([price, title])
    except:
        print("")

def printgoods(itl):
    tplt = "{:2}\t{:8}\t{:16}"
    print(tplt.format("No.", "Price", "Title"))
    count = 0
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', db='company', charset="utf8")
    cur = conn.cursor()
    sqlc = '''create table coffee(
        id int(11) not null auto_increment primary key,
        name varchar(255) not null,
        price float not null
    )DEFAULT CHARSET=utf8;'''
    try:
        A = cur.execute(sqlc)
        conn.commit()
        print('Success')
    except:
        print("Error")  # e.g. the table already exists
    for g in itl:
        count = count + 1
        b = tplt.format(count, g[0], g[1])
        sqla = '''insert into coffee(name,price) values(%s,%s);'''
        try:
            B = cur.execute(sqla, (g[1], g[0]))
            conn.commit()
            print('Success')
        except:
            print("Error")
        # save_path = 'D:/taobao.txt'
        # f = open(save_path, 'a')
        # f.write(b + '\n')
        # f.close()
    conn.commit()
    cur.close()
    conn.close()

def main():
    goods = "咖啡"  # search keyword: "coffee"
    depth = 2      # number of result pages to fetch
    start_url = 'https://s.taobao.com/search?q=' + goods
    List = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(i * 44)  # Taobao paginates in steps of 44 items
            html = getHTMLtext(url)
            getpage(List, html)
        except:
            continue
    printgoods(List)
    # savefiles(data)
main()

Taobao product scraping + saving to a MySQL database.
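
The extraction in getpage leans on eval to strip the quotes around each matched JSON fragment. A quick self-contained check of that trick; the fragment here is made up for illustration, in the same shape as Taobao's embedded JSON:

import re

html = '"raw_title":"Yunnan coffee beans","view_price":"59.90"'
plt = re.findall(r'"view_price":"[\d.]*"', html)
nlt = re.findall(r'"raw_title":".*?"', html)
price = eval(plt[0].split(':')[1])  # eval('"59.90"') -> '59.90'
title = eval(nlt[0].split(':')[1])
print(price, title)  # 59.90 Yunnan coffee beans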

Reposted from: https://www.cnblogs.com/hellangels333/p/8621368.html
