爬虫实战:12306登录
爬虫实战:破解点触验证码,实现12306登录
1.目标
实现12306登录,获取登录cookies
2.技术点
- 1.借用第三方打码平台,进行图片验证码识别
- 2.破解selenium webdriver反爬
3.思路
- 1.输入账号密码
- 2.获取验证图片
- 3.识别图片,获取坐标
- 4.图片验证
- 5.登录
- 6.滑动滑块
4.环境
python + selenium + 超级鹰
5.代码
- 1.12306登录.py
# @author: zly
# @function: Touch verification code
# @time: 2020-09-15
# @copyright: All Rights Reserved

import random
import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

from chaojiying import Chaojiying_Client
from constants import *


class MakeTrack:
    """Track generator: builds a list of horizontal offsets for a slider drag.

    :param distance: total distance the slider has to travel, in pixels.
    """

    # Upper bound on how many times make_track() regenerates a track that
    # came out longer than TRACKMAXLENGTH. Keeps the retry finite when the
    # distance is so large that a short track cannot be produced.
    MAX_ATTEMPTS = 20

    def __init__(self, distance=DISTANCE):
        self.distance = distance

    def segmentate(self, s):
        """Split one displacement into a list of smaller pieces.

        The pieces are deliberately uneven and do not necessarily add up to
        ``s``; make_track() compensates for the difference afterwards.

        :param s: displacement to split --> int
        :return: list of int pieces (1, 2 or 5 of them depending on size)
        """
        magnitude = abs(s)
        if SEGMENTNUM1 <= magnitude < SEGMENTNUM2:
            third = round(s / 3)
            return [third - 3, third + 3]
        if magnitude >= SEGMENTNUM2:
            fifth = round(s / 5)
            return [fifth - 5, fifth - 3, fifth, fifth + 3, fifth + 5]
        return [round(s)]

    def _generate_once(self):
        """Build one candidate track whose offsets sum to ``self.distance``."""
        track = []
        current = v0 = 0
        while self.distance > current:
            # Random time slice and random acceleration give a random step.
            t = random.randint(1, 4) / 2
            a = random.randint(1, 3)
            # Velocity and displacement for this time slice.
            v0 += a * t
            s = v0 * t + 0.5 * a * t ** 2
            # Break displacements that are too large into smaller pieces.
            track.extend(self.segmentate(round(s)))
            current += s

        # Compensate for overshoot: drop trailing pieces until we are at or
        # below the target. The emptiness check avoids popping from an empty
        # list when the distance is negative.
        while track and sum(track) > self.distance:
            track.pop()
        # Compensate for undershoot with one final piece.
        remainder = self.distance - sum(track)
        if remainder:
            track.append(remainder)
        return track

    def make_track(self):
        """Make a sliding track that imitates human movement.

        A track longer than TRACKMAXLENGTH pieces is regenerated, at most
        MAX_ATTEMPTS times. If no attempt is short enough, the shortest
        track seen is returned.

        :return: list of x offsets whose sum equals ``self.distance``
        """
        best = None
        for _ in range(self.MAX_ATTEMPTS):
            track = self._generate_once()
            if len(track) <= TRACKMAXLENGTH:
                return track
            if best is None or len(track) < len(best):
                best = track
        return best


class Login12306(Chaojiying_Client):
    """Log in to 12306 by solving the touch captcha through Chaojiying.

    :param username: 12306 account --> str
    :param password: 12306 password --> str
    :param cusername: Chaojiying account --> str
    :param cpassword: Chaojiying password --> str
    :param soft_id: Chaojiying software ID --> str
    :param codetype: captcha type --> int
    :param path: file path the captcha screenshot is written to --> str

    There are three ways to supply the configuration, in priority order:
      1. a ``configs`` dict passed as a keyword argument
      2. the constants in ``constants.py`` (used when USERNAME is non-empty)
      3. the explicit constructor arguments

    Constructing an instance immediately runs the whole login flow.
    """

    LOGIN_URL = 'https://kyfw.12306.cn/otn/resources/login.html'

    def __init__(self, username=None, password=None,
                 cusername=None, cpassword=None, soft_id=None,
                 codetype=None, path=None,
                 *args, **kwargs):
        configs = kwargs.get('configs')
        if configs:
            # Configuration passed as a dict.
            username = configs.get('username', '')
            password = configs.get('password', '')
            cusername = configs.get('cusername', '')
            cpassword = configs.get('cpassword', '')
            soft_id = configs.get('soft_id', '')
            codetype = configs.get('codetype', '')
            path = configs.get('path', '')
        elif USERNAME:
            # Configuration taken from the constants module.
            username = USERNAME
            password = PASSWORD
            cusername = CUSERNAME
            cpassword = CPASSWORD
            soft_id = SOFTID
            codetype = CODETIPE
            path = PATH

        # Connect to Chaojiying. This must happen on every configuration
        # path, otherwise PostPic / ReportError have no credentials.
        super().__init__(username=cusername,
                         password=cpassword,
                         soft_id=soft_id)

        # The parent stored the Chaojiying credentials in base_params during
        # its __init__; from here on username/password mean the 12306 ones.
        self.username = username
        self.password = password
        self.cusername = cusername
        self.cpassword = cpassword
        self.soft_id = soft_id
        self.codetype = codetype
        self.path = path
        # Filled in by slide() after a successful login.
        self.cookies = None

        # ``run`` is a property: accessing it executes the login flow.
        self.run

    @property
    def run(self):
        """Execute the complete login flow.

        This is a property, so it runs on attribute access (``obj.run``).

        :return: True when the login succeeded, False otherwise
        """
        self.driver = self.prepares()
        self.driver.get(self.LOGIN_URL)
        self.driver.implicitly_wait(IMPLICITLYWAIT)
        self.driver.maximize_window()
        time.sleep(1)

        # 1. Enter account and password
        self.input_user_pwd(username=self.username, password=self.password)
        # 2. Capture the captcha image
        self.get_pic()

        while True:
            # 3. Recognise the image and obtain click coordinates
            #    (one immediate retry on failure)
            position, pic_id = self.get_position(codetype=self.codetype)
            if not position:
                position, pic_id = self.get_position(codetype=self.codetype)
            if not position:
                # Still nothing to click: fetch a fresh captcha instead of
                # passing None to img_click.
                self._reload_captcha()
                continue

            # 4. Click the captcha
            self.img_click(position)

            # 5. Log in
            if not self.login(pic_id):
                self._reload_captcha()
                continue

            # 6. Drag the slider
            return bool(self.slide())

    def _reload_captcha(self):
        """Refresh the page, re-enter credentials and capture a new captcha."""
        self.driver.refresh()
        self.input_user_pwd(username=self.username, password=self.password)
        self.get_pic()

    def prepares(self):
        """Create a Chrome webdriver with the automation markers hidden.

        12306 refuses the login even after a correct captcha when it detects
        webdriver. The automation switch and extension are disabled and
        ``navigator.webdriver`` is overridden to return undefined on every
        new document.

        :return: the configured Chrome webdriver
        """
        options = webdriver.ChromeOptions()
        options.add_experimental_option(
            "excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty("
                       "navigator, 'webdriver', "
                       "{get: () => undefined})"})
        return driver

    def input_user_pwd(self, username=None, password=None):
        """Switch to the account/password tab and type the credentials.

        :param username: 12306 account --> str, default is None
        :param password: 12306 password --> str, default is None
        :return: 0; the value carries no meaning
        """
        # Switch to account/password login.
        self.driver.find_element(
            By.XPATH, '//li[@class="login-hd-account"]/a').click()
        # Wait for the page's JavaScript to finish; typing earlier fails.
        time.sleep(2)
        self.driver.find_element(By.ID, 'J-userName').send_keys(username)
        self.driver.find_element(By.ID, 'J-password').send_keys(password)
        return 0

    def get_pic(self):
        """Screenshot the touch-captcha image to ``self.path``.

        :return: 0; the value carries no meaning
        """
        self.driver.find_element(By.ID, 'J-loginImg').screenshot(self.path)
        return 0

    def get_position(self, codetype=None):
        """Send the captcha to Chaojiying and parse the click coordinates.

        :param codetype: captcha type --> int, default is None
        :return: ``[position, pic_id]``; ``position`` is a nested list such
            as ``[['55', '55'], ['88', '88']]``, or None when recognition
            failed
        """
        verify_data = self.PostPic(self.path, codetype)
        print(verify_data)
        if verify_data['err_no'] == 0:
            # pic_str looks like "x1,y1|x2,y2|..."
            points = verify_data['pic_str'].split('|')
            position = [point.split(',') for point in points]
            return [position, verify_data['pic_id']]
        self.ReportError(verify_data['pic_id'])
        return [None, verify_data['pic_id']]

    def img_click(self, position):
        """Click the captcha image at every recognised coordinate.

        :param position: click coordinates --> nested list,
            ``[['55', '55'], ['88', '88'], ...]``
        :return: 0; the value carries no meaning
        """
        element = self.driver.find_element(By.ID, 'J-loginImg')
        # NOTE(review): the origin used by move_to_element_with_offset
        # (top-left vs. centre of the element) differs between Selenium
        # releases -- verify the offsets against the installed version.
        for x, y in position:
            # move_to_element_with_offset only accepts int offsets.
            ActionChains(self.driver).move_to_element_with_offset(
                element, int(x), int(y)).click().perform()
        return 0

    def login(self, pic_id=None):
        """Press the login button and check whether the captcha passed.

        :param pic_id: Chaojiying picture id, reported back on failure
        :return: True when the captcha was accepted, otherwise False
        """
        self.driver.find_element(By.ID, 'J-login').click()
        verify_tag = self.driver.find_element(
            By.XPATH, '//*[@class="lgcode-error"]')
        # A visible error tag means the captcha was rejected.
        if verify_tag.is_displayed():
            # Report the wrong answer to Chaojiying.
            self.ReportError(pic_id)
            print("图片验证失败,报错成功")
            return False
        print("图片验证成功")
        time.sleep(3)
        return True

    def slide(self):
        """Drag the slider and confirm the login.

        On success the session cookies are printed and stored in
        ``self.cookies``. The driver is closed on every path.

        :return: True when the login succeeded, otherwise False
        """
        try:
            slider = self.driver.find_element(By.ID, 'nc_1_n1z')
            track = MakeTrack().make_track()
            ActionChains(self.driver).click_and_hold(slider).perform()
            for offset in track:
                ActionChains(self.driver).move_by_offset(offset, 0).perform()
            ActionChains(self.driver).release(slider).perform()
            # How long this takes depends on network speed.
            time.sleep(5)
        except Exception as e:
            # Typically "stale element reference": a page refresh detached
            # the slider element. It does not happen on every run.
            print(str(e))
            time.sleep(10)
            self.driver.quit()
            return False

        # Check whether the login succeeded.
        try:
            self.driver.find_element(
                By.XPATH, '//*[@class="btn btn-primary ok"]').click()
            cookies = self.driver.get_cookies()
            self.cookies = cookies
            print("恭喜您登陆成功")
            print(cookies)
            time.sleep(10)
            self.driver.quit()
            return True
        except Exception as e:
            print(str(e))
            print("恭喜您登陆失败,再来一次吧")
            time.sleep(10)
            self.driver.quit()
            return False


configs = {
    'username': '',    # 12306 account
    'password': '',    # 12306 password
    'cusername': '',   # Chaojiying account
    'cpassword': '',   # Chaojiying password
    'soft_id': '',     # software ID
    'codetype': 9004,  # captcha type
    'path': ''         # captcha image path
}

Login12306(configs=configs)
- 2、chaojiying.py
from hashlib import md5

import requests


class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service.

    :param username: Chaojiying account
    :param password: Chaojiying password in plain text; only its MD5 hex
        digest is kept and sent
    :param soft_id: Chaojiying software ID
    :param timeout: seconds to wait for each HTTP request before
        ``requests`` raises a timeout error (default 60)
    """

    def __init__(self, username, password, soft_id, timeout=60):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Without a timeout a stalled connection blocks the caller forever.
        self.timeout = timeout
        # Credentials sent with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, path, codetype):
        """Upload a captcha image for recognition.

        :param path: path of the image file
        :param codetype: captcha type, see
            http://www.chaojiying.com/price.html
        :return: the decoded JSON response
        """
        with open(path, 'rb') as f:
            imagecontent = f.read()
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', imagecontent)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers,
                          timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha.

        :param im_id: picture ID of the captcha being reported
        :return: the decoded JSON response
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers,
                          timeout=self.timeout)
        return r.json()
- 3、constants.py
# 12306 account
USERNAME = ''
# 12306 password
PASSWORD = ''
# Chaojiying account
CUSERNAME = ''
# Chaojiying password
CPASSWORD = ''
# Chaojiying software ID
SOFTID = ''
# Captcha type (the name is spelled CODETIPE; the login script imports it
# under this exact spelling)
CODETIPE = ''
# Path of the captcha image file
PATH = ''
# Distance the slider is dragged, in px
DISTANCE = 425
# Size thresholds used when splitting a track displacement into pieces
SEGMENTNUM1 = 30
SEGMENTNUM2 = 50
# Maximum number of pieces in a track
TRACKMAXLENGTH = 30
# Wait time in seconds; the login script passes it to
# driver.implicitly_wait(), so it is an implicit wait
IMPLICITLYWAIT = 10
温馨提示:千万不要干坏事喲~~,否则抓进局里后果自负…
爬虫实战:12306登录相关推荐
- python 12306登录_python爬虫--模拟12306登录
模拟12306登录 超级鹰: #!/usr/bin/env python # coding:utf-8 import requests from hashlib import md5 class Ch ...
- python 12306登录 2019_python爬虫--模拟12306登录
模拟12306登录 超级鹰: #!/usr/bin/env python # coding:utf-8 import requests from hashlib import md5 class Ch ...
- python成绩查询系统_Python爬虫实战:登录教务系统查成绩
本文记录我用Python登录教务系统查询成绩的过程.手动输入验证码,简单获取成绩页面.后续将可能更新自动识别验证码登录查询 前期准备 本爬虫用到了Python的Requests库和BeautifulS ...
- python爬虫登录12306失败_使用python爬虫模拟12306登录方法
试了好久登录的时候总是显示:系统忙,请刷新,,,太折磨人了,搞了半天才想到是请求头部的问题..... 验证码还是要人工识图..#!/bin/env python # -*- coding=utf-8 ...
- Python3 爬虫实战 — 模拟登陆12306【点触验证码对抗】
登陆时间:2019-10-21 实现难度:★★★☆☆☆ 请求链接:https://kyfw.12306.cn/otn/resources/login.html 实现目标:模拟登陆中国铁路12306,攻 ...
- Python爬虫实战(5):模拟登录淘宝并获取所有订单
Python爬虫入门(1):综述 Python爬虫入门(2):爬虫基础了解 Python爬虫入门(3):Urllib库的基本使用 Python爬虫入门(4):Urllib库的高级用法 Python爬虫 ...
- Python爬虫实战之(五)| 模拟登录wechat
作者:xiaoyu 微信公众号:Python数据科学 知乎:Python数据分析师 不知何时,微信已经成为我们不可缺少的一部分了,我们的社交圈.关注的新闻或是公众号.还有个人信息或是隐私都被绑定在了一 ...
- Python爬虫实战之(五)| 模拟登录wechat 1
作者:xiaoyu 微信公众号:Python数据科学 知乎:Python数据分析师 不知何时,微信已经成为我们不可缺少的一部分了,我们的社交圈.关注的新闻或是公众号.还有个人信息或是隐私都被绑定在了一 ...
- 爬虫三(Bs4搜索、Selenium基本使用、无界面浏览器、Selenium自动登录百度案例、自动获取12306登录验证码案例、切换选项卡、浏览器前进后退、登录Cnblogs获取Cookie自动点赞)
文章标题 一.Bs4搜索文档树 二.CSS选择器 三.selenium基本使用 四.无界面浏览器 五.selenium其他使用 1)自动登录百度案例 2)获取位置属性大小.文本 3)自动获取12306 ...
最新文章
- 管理者如何打造一个有执行力的团队?
- MySQL存储过程的创建及调用
- 【数据结构】数组和广义表
- linux ram 权限,我如何将Linux二进制文件限制为有限的RAM数量?
- 中国电信发布转型升级战略:构建一横四纵生态圈
- Oracle创建、删除、备份表
- 路由器网络性能测试软件,路由器性能测试
- 前端虚拟列表的实现原理
- 软件开发需要学好数学吗?
- Apache启用GZIP压缩网页传输方法
- KC伺服舵机带参控制程序程序
- Windows 10 网络和Internet设置中WLAN选项消失
- 说给自己听 -- 三毛
- html 伸缩盒子布局,详解CSS3伸缩布局盒模型Flex布局
- 【ArcGIS】利用字段计算器按 OSM 道路等级生成道路速度
- css 输密码键盘,CSS实现迷你键盘
- 通过数据分析,了解外国人眼里的真实李子柒
- maven 国内可用的中央仓库 阿里云
- 你做的网页在哪些浏览器测试过,这些浏览器的内核分别是什么?
- javaSE进阶学习笔记
热门文章
- 方舟进化私服找不到服务器,方舟生存进化私服怎么设置 私人服务器设置方法分享...
- 内存快速分配和慢速分配
- 面试之站在面试官的角度去面试
- 华硕路由器信息发现服务器,研究人员发现华硕路由器收集用户访问记录等隐私数据...
- 企业微信网络抓包工具devtools_resources
- 智慧煤矿理论篇2-煤矿5G与WiFi6
- erdas裁剪影像_ERDAS软件应用(一)遥感影像数据裁剪
- 如何准备OCP考试?
- VM虚拟机20G磁盘扩展到40G的Linux操作记录
- html象棋开题报告设计要求,C++游戏设计中国象棋开题报告.docx