利用python requests库模拟登陆知乎

当初搜模拟登陆的时候在知乎上也找到一些内容。

以下是代码

import requests
import time
import json
import os
import re
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):
    """Helper for talking to Zhihu that owns a single ``requests`` session.

    Dated 2015.11.11 by the original author.

    Usage::

        client = ZhiHuClient()
        # Call login() once on first use to create the cookie file;
        # later runs can skip this step.
        client.login("username", "password")
        # Use the session for any further network operations
        # (see the requests library documentation).
        session = client.getSession()
    """

    # The last path segment of the login URL is the account type.
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"
    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    # Both files live next to the script (sys.path[0]).
    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        """Create the session and, if a cookie file exists, reuse it."""
        os.chdir(sys.path[0])  # make the script's directory the working directory
        self.__session = requests.Session()
        # Copy the class-level headers into the session instead of binding the
        # shared dict itself, so per-session header changes cannot leak into
        # the class or into other instances. Class attributes are reached via
        # ``self`` so that renaming the class later does not break this code.
        self.__session.headers.update(self.headers)

        # If a cookie file is already present, use it instead of logging in.
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("检测到cookie文件，直接使用cookie登录")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            name_tag = soup.find("span", class_="name")
            if name_tag is not None:
                print("已登陆账号： %s" % name_tag.getText())
            else:
                # The page did not contain the account-name element; the
                # stored cookie has presumably expired or the markup changed.
                print("未能获取账号名，cookie可能已失效，请调用login方法重新登录！")
        else:
            print("没有找到cookie文件，请调用login方法登录一次！")

    def login(self, username, password):
        """Log in interactively and save the session cookies to disk.

        Fetches the ``_xsrf`` token from the home page, then loops:
        download the captcha image, open it for the user, read the captcha
        from stdin and POST the credentials. The loop ends on the first
        response whose ``r`` field is 0.

        Example responses recorded by the original author:

        Wrong captcha:
            {'errcode': 1991829, 'r': 1, 'data': {'captcha': '请提交正确的验证码 :('}, 'msg': '请提交正确的验证码 :('}
        Success:
            {'r': 0, 'msg': '登陆成功'}

        Args:
            username: Account name; all digits is treated as a phone number,
                anything else as an e-mail address.
            password: Account password.
        """
        self.__username = username
        self.__password = password
        self.__loginURL = self.loginURL.format(self.__getUsernameType())

        # Open any page to obtain the _xsrf token required by the login form.
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]

        while True:
            # Download the captcha image.
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)

            # Let a human read the captcha.
            print("=" * 50)
            print("已打开验证码图片，请识别！")
            # NOTE(review): running the image path through the shell relies on
            # the OS file association (presumably Windows) — confirm before
            # using on other platforms.
            subprocess.call(self.captchaFile, shell=True)
            captcha = input("请输入验证码：")
            os.remove(self.captchaFile)

            # Send the POST request.
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                self.__getUsernameType(): self.__username,
                "captcha": captcha,
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            result = res.json()  # parse the response body once
            if result["r"] == 0:
                print("登录成功")
                self.__saveCookie()
                break
            else:
                print("登录失败")
                print("错误信息 --->", result["msg"])

    def __getUsernameType(self):
        """Return the account type for the current username.

        According to the original author's testing, the web page treats an
        all-digit name as ``phone_num`` and everything else as ``email``.
        """
        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the session cookies (as a dict) to the cookie file as JSON."""
        with open(self.cookieFile, "w") as output:
            cookies = self.__session.cookies.get_dict()
            json.dump(cookies, output)
            print("=" * 50)
            print("已在同目录下生成cookie文件：", self.cookieFile)

    def __loadCookie(self):
        """Return the dict stored in the cookie file, or None if it is missing."""
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                cookie = json.load(f)
                return cookie
        return None

    def open(self, url, delay=0, timeout=10):
        """GET ``url`` with the session and return the Response object.

        Args:
            url: Address to fetch.
            delay: Seconds to sleep before the request (0 means no delay).
            timeout: Request timeout in seconds.
        """
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        """Return the underlying ``requests`` session."""
        return self.__session


if __name__ == '__main__':
    client = ZhiHuClient()
    # Call login() once on first use to create the cookie file;
    # later runs can skip this step.
    client.login('xxxxxx', 'xxxxxxxx')
    # Use the session for any further network operations
    # (see the requests library documentation).
    session = client.getSession()
    r = session.get('http://www.zhihu.com')
    # Print the response that was just fetched (the original printed the
    # undefined name ``s``).
    print(r.text)

来自知乎：

点击打开链接

这模拟登陆的代码可以作为参考。

最后是关于获取天气预报的爬虫代码：

import urllib.request
import re
def GetHtmlCode(url):
    """Download ``url`` and return its body decoded as GBK text.

    The response is closed even if reading or decoding fails.
    """
    with urllib.request.urlopen(url) as page:
        return page.read().decode('gbk')


def FindGXUrl(homePage):
    """Return the absolute URL of the 江苏 (Jiangsu) page found in ``homePage``.

    Raises:
        AttributeError: if the expected link is not present in the HTML.
    """
    # First isolate the whole <a> tag, then pull the URL out of it.
    gx_re_vague = r'<a href="[\S]+" rel="[\S]+">江苏</a>'
    gx_url_vague = re.search(gx_re_vague, homePage).group()
    gx_re = r'http://[\w\./]+\.htm'
    gx_url = re.search(gx_re, gx_url_vague).group()
    return gx_url


def FindNNUrl(GXPage):
    """Return the site-relative path of the 南京 (Nanjing) page in ``GXPage``.

    Raises:
        AttributeError: if the expected link is not present in the HTML.
    """
    by_re_vague = r'<a href="[\S]+?" title="[\S]+?">南京</a>'
    nn_url_vague = re.search(by_re_vague, GXPage).group()
    by_re = r'/[\S]+?\.htm'
    nn_url_suffix = re.search(by_re, nn_url_vague).group()
    return nn_url_suffix


def GetWeatherBlockList(WeatherPage):
    """Return every ``<li class="week-detail-now" >...</li>`` block as a list of strings."""
    weatherBlock_re = r'<li class="week-detail-now" >[\s\S]+?</li>'
    weather_re = re.compile(weatherBlock_re)
    weatherList = weather_re.findall(WeatherPage)
    return weatherList


class Weather:
    """One day's forecast: date, day/night conditions and low/high temperature."""

    # Class-level defaults; every instance overrides them in __init__.
    date = ''
    daytime = ''
    nighttime = ''
    temperatureL = ''
    temperatureH = ''

    def __init__(self, d, dT, nT, tL, tH):
        """Store the date, daytime and night-time text, and low/high temperature."""
        self.date = d
        self.daytime = dT
        self.nighttime = nT
        self.temperatureL = tL
        self.temperatureH = tH

    def print(self):
        """Write the forecast to stdout."""
        # Inside the method body ``print`` still resolves to the builtin.
        print('\n%s：白天：%s，夜间：%s，\n最低温度：%sC，最高温度：%sC\n' % (
            self.date, self.daytime, self.nighttime,
            self.temperatureL, self.temperatureH))


def MakeWeatherInfo(block):
    """Parse one weather ``<li>`` block into a ``Weather`` object.

    Temperatures are returned as strings and keep a leading minus sign.

    Raises:
        AttributeError: if the date, daytime, night-time or temperature
            markup is missing from ``block``.
        IndexError: if fewer than two temperatures are found.
    """
    dA_re = r'[\d]{2}月[\d]{2}日'
    dA = re.search(dA_re, block).group()

    # Daytime condition: match the <b> element, then strip the markup around
    # the text.
    dT_re = r'<b><font class="gray">白天：</font>.{1,6}</b>'
    dT = re.search(dT_re, block).group()
    dT = re.sub(r'<b>.+</font>', '', dT)
    dT = re.sub(r'</b>', '', dT)

    # Night-time condition, same approach.
    nT_re = r'<b><font class="gray">夜间：</font>.{1,6}</b>'
    nT = re.search(nT_re, block).group()
    nT = re.sub(r'<b>.+</font>', '', nT)
    nT = re.sub(r'</b>', '', nT)

    t_re = r'<font class="blue">.{0,4}</font>～<font class="red">.{0,4}</font>'
    t = re.search(t_re, block).group()
    # Keep an optional leading minus so sub-zero temperatures are not turned
    # into positive ones (assumes the page writes them with an ASCII '-').
    t = re.findall(r'-?\d+', t)
    return Weather(dA, dT, nT, t[0], t[1])
# Walk from the site home page to the Jiangsu page, then to the Nanjing page.
homePage = GetHtmlCode("http://tianqi.2345.com/")
gx_url = FindGXUrl(homePage)
GXPage = GetHtmlCode(gx_url)
nn_url_suffix = FindNNUrl(GXPage)
# The Nanjing link is site-relative, so prepend the host.
nn_url = 'http://tianqi.2345.com' + nn_url_suffix
NNPage = GetHtmlCode(nn_url)
# Get a list of two days' weather.
weatherList = GetWeatherBlockList(NNPage)
weather1 = MakeWeatherInfo(weatherList[0])
weather2 = MakeWeatherInfo(weatherList[1])
weather1.print()
weather2.print()

思路很简单。但主要也是正则表达式的书写，还是得勤加练习才对。

用Python实现爬虫的确非常简单。但是刚接触scrapy之类的框架时，发现在Python3下连安装都会遇到各种error，累觉不爱。

现在都是单线程。以后能做多线程和分布式爬虫的时候再回来补充吧。

之后几个月打算研究django，但是这估计也是个很大的坑呢233333.还得学习SQL语言balabala..挑战性很足。

如果学到什么东西再往博客里放吧。记录一下学习的过程。

大学实在是太枯燥了。也许是我不太喜欢社交呢233333。

利用python requests库模拟登陆知乎相关推荐

利用Python requests库模拟登陆学校教务系统
在研究了一会requests库的实现之后.发现requests的确非常强大.. 几行代码就登陆上了学校的教务系统,但也许是我们学校的教务系统做的太烂了吧23333.动不动就血崩. 下面是代码. imp ...
利用python requests库爬取淘宝商品评论_python
文章目录一.起因二.项目实现 1. 分析实现方式 2. 编码实现 3. 完整代码三.思考与改进一.起因看到一篇文章,感觉自己可以动手试试 Python 不用selenium 带你高效爬取京东 ...
用Python+requests库批量下载知乎高赞回答中的所有表情包
干货分享二十多本Python好书,戳这领取引言今天研究了会requests库.发现和urllib库功能类似,很好上手,因此写了个Demo爬了爬表情包.我选取了几个知乎里关于表情包问题的高赞回答, ...
python wechatsougou_使用Python的requests库模拟登陆微信搜狗，爬取100X10篇微信文章并保存到MySQL...
自学的python和爬虫技术.使用到Redis,MySQL数据库:request请求模块:re,Xpath解析模块:dumps, loads序列化和反序列化.还可以配合代理池使用. 爬取的是https ...
python爬虫设计在哪里_《python 爬虫教程知乎》怎样用Python设计一个爬虫模拟登陆知乎...
<python 爬虫教程知乎> 怎样用Python设计一个爬虫模拟登陆知乎 python 爬虫教程知乎2020-09-23 01:45:13人已围观怎样用Python设计一个爬虫模拟 ...
python 登录知乎_python模拟登陆知乎（最新版)
原因为啥要写这一篇文章呢? (主要是qq群内有人在模拟登陆知乎,一直不成功)然后我抓包看了下,发现知乎登陆页已经改版了,而且难度大大提高了. 开始抓包首先内,还是打开知乎首页,然后输入账号密码,登 ...
利用requests库模拟访问博客来提升文章阅读量
利用requests库模拟访问博客来提升文章阅读量一.概述二.简陋版程序三.升级版程序一.概述有的同学在csdn上写了文章之后,看着自己文章的阅读量,少的可怜,不禁希望能把阅读量快速涨上去, ...
python requests模拟登录淘宝购物车下单_Python使用requests库模拟登录淘宝账号（上）...
学好Python这款编程语言,我们能够设计出很多程序要帮助我们完成数据采集等工作,ET代理今天要跟大家介绍如何用Python模拟登录淘宝账号? 看了下网上有很多关于模拟登录淘宝,但是基本都是使用scr ...
爬虫实战3：模拟登陆知乎并爬取任意帖子数据
刚学爬虫时,看到一篇文章硬核破解知乎登陆,心潮澎湃,真男人!符合我的胃口!哼哧哼哧的立刻安排! (半个小时过后) 似乎我是弱智?很多看不懂?(百度乱搜中-)恩还有种简单的?sel ...

利用python requests库模拟登陆知乎

利用python requests库模拟登陆知乎相关推荐

最新文章

热门文章