1. 打开浏览器,登录 “龙de船人”

需要先安装 selenium(pip install selenium),并下载与浏览器版本对应的 chromedriver,放到 python 安装目录下

1.1在浏览器输入chrome://version,即可查询版本号

1.2下载与 chrome 浏览器版本对应的驱动文件(chromedriver)至python安装目录下

下载地址:http://chromedriver.storage.googleapis.com/index.html

  • 选择正确的版本号

  • windows64位选win32即可

  • 解压放到python安装目录

打开页面代码如下:

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class CrackSlider:
    """Opens a Chrome session on the imarine.cn login page."""

    def __init__(self):
        """Start Chrome and load the login page."""
        super(CrackSlider, self).__init__()
        self.opts = Options()
        # Run Chrome with its sandbox disabled.
        self.opts.add_argument('--no-sandbox')
        self.driver = webdriver.Chrome(options=self.opts)
        self.login_url = "https://www.imarine.cn/member.php?mod=logging&action=login"
        try:
            # The page is opened by the browser itself, so there is no GET/POST
            # distinction to make: a plain get() is all that is needed.
            self.driver.get(self.login_url)
        except Exception:
            # NOTE(review): a failed page load is only reported through this
            # message and execution continues - confirm this is intended.
            print("开始!")
        # Wait 2 seconds.
        time.sleep(2)
  2. 自动登录(难点:滑块验证)

from io import BytesIO
import cv2
import numpy as np
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
# The names below are used by this excerpt but were missing from its imports.
import time

import requests
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


class CrackSlider:
    """Logs in to imarine.cn and solves the slider captcha shown on login."""

    def __init__(self):
        """Fill in the login form and submit it."""
        super(CrackSlider, self).__init__()
        # Opening the browser is omitted in this excerpt; self.driver must
        # already exist at this point.
        # Wait 2 seconds.
        time.sleep(2)
        # Type the account name and the password, then press the login button.
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/div[1]/div[2]/div[1]/input",
        ).send_keys("账号")
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/div[2]/div[2]/div[1]/input",
        ).send_keys("密码")
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/button",
        ).click()
        time.sleep(1)
        self.wait = WebDriverWait(self.driver, 10)

    def get_pic(self):
        """Download the captcha images to ``target.jpg`` and ``template.png``.

        Switches the driver into the ``tcaptcha_iframe`` frame and leaves it
        there, so that ``crack_slider`` can find the slider afterwards.
        """
        time.sleep(5)
        self.driver.switch_to.frame('tcaptcha_iframe')
        # Locate the two captcha images and read their source URLs.
        target_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[2]/img"
        ).get_attribute('src')
        template_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[3]/img"
        ).get_attribute('src')
        # A timeout keeps a stalled download from hanging the whole run.
        target_img = Image.open(BytesIO(requests.get(target_link, timeout=10).content))
        template_img = Image.open(BytesIO(requests.get(template_link, timeout=10).content))
        target_img.save('target.jpg')
        template_img.save('template.png')

    def crack_slider(self, distance):
        """Drag the captcha slider ``distance`` pixels to the right."""
        slider = self.driver.find_element(by=By.ID, value='tcaptcha_drag_thumb')
        ActionChains(self.driver).click_and_hold(slider).perform()
        ActionChains(self.driver).move_by_offset(xoffset=distance, yoffset=0).perform()
        time.sleep(2)
        ActionChains(self.driver).release().perform()
        ActionChains(self.driver).click(slider).perform()


def add_alpha_channel(img):
    """Return a copy of a 3-channel image with an opaque alpha channel added."""
    # The channel order of the input is kept as is; only a fourth plane is added.
    first, second, third = cv2.split(img)
    alpha_channel = np.full(third.shape, 255, dtype=third.dtype)
    return cv2.merge((first, second, third, alpha_channel))


def handel_img(img):
    """Return the Canny edge map of a 4-channel image."""
    # NOTE(review): images loaded with cv2.imread are in BGR(A) order while
    # this conversion assumes RGBA; kept as is because the offset used in
    # job() was tuned against this behaviour.
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)  # to grayscale
    img_blur = cv2.GaussianBlur(img_gray, (5, 5), 1)  # Gaussian blur
    return cv2.Canny(img_blur, 60, 60)  # Canny edge detection


def match(img_jpg_path, img_png_path):
    """Return the x coordinate where the template best matches the target.

    Raises:
        FileNotFoundError: if either image cannot be read.
    """
    img_jpg = cv2.imread(img_jpg_path, cv2.IMREAD_UNCHANGED)
    img_png = cv2.imread(img_png_path, cv2.IMREAD_UNCHANGED)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if img_jpg is None or img_png is None:
        raise FileNotFoundError(
            "cannot read captcha images: %r, %r" % (img_jpg_path, img_png_path)
        )
    # Give the jpg an alpha channel if it only has three channels.
    if img_jpg.shape[2] == 3:
        img_jpg = add_alpha_channel(img_jpg)
    img = handel_img(img_jpg)
    small_img = handel_img(img_png)
    # The original passed the literal 3, which is TM_CCORR_NORMED.
    scores = cv2.matchTemplate(img, small_img, cv2.TM_CCORR_NORMED)
    _, _, _, max_loc = cv2.minMaxLoc(scores)
    return max_loc[0]  # x of the best match, i.e. the distance to move


def job():
    """Log in, download the captcha images and drag the slider into place."""
    # 1. Open chromedriver and download the images.
    cs = CrackSlider()
    cs.get_pic()
    # 2. Compare the images and compute the distance.
    img_jpg_path = 'target.jpg'  # adjust the path as needed
    img_png_path = 'template.png'  # adjust the path as needed
    distance = match(img_jpg_path, img_png_path)
    # Hand-tuned scaling and offset from image pixels to slider travel.
    distance = distance / 680 * 340 - 25
    # 3. Move the slider.
    cs.crack_slider(distance)
  3. 船舶订单查询

根据元素full xpath定位元素,进行点击或是文本输入

    def query(self):
        """Open the order search form, fill in the filters and run the search.

        Every element is located by its absolute XPath and then clicked or
        typed into. Always returns 0.
        """

        def locate(xpath):
            # Single place for the repeated absolute-XPath lookup.
            return self.driver.find_element(by=By.XPATH, value=xpath)

        time.sleep(10)
        print("正在点击查询按钮......")
        time.sleep(5)
        # Open the query page.
        query = locate("/html/body/div[6]/div/div[1]/div[2]/a[5]")
        time.sleep(3)
        query.click()
        time.sleep(5)
        print("正在输入查询条件......")
        # Start date: open the date picker.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[1]/input[1]").click()
        # Year.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[1]/td[2]/a[1]").click()
        locate("/html/body/div[1]/div/div[2]/div[7]/a[9]/span").click()
        # Month.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[1]/td[2]/a[2]").click()
        locate("/html/body/div[1]/div/div[3]/a[1]/span").click()
        # Day.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[3]/td[7]").click()
        # End date: open the date picker.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[1]/input[2]").click()
        # Original comment: "current date".
        # NOTE(review): the calendar cell is hard-coded, so it presumably only
        # matches today's date by coincidence - confirm.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[5]/td[4]/a").click()
        # Country filter: China.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[5]/input").send_keys("中国")
        # Run the search.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[7]/button/span").click()
        time.sleep(2)
        return 0
  4. 获取表单内容

def get_and_update(self):
    """Append the rows of the current result page to ``self.arr_res``.

    The text of the result container is split into lines; the first three
    lines are skipped and every following line is split on single spaces.

    Returns:
        False as soon as a row whose first field sorts before ``'2022'`` is
        met (that row and the ones after it are not stored), True once the
        whole page has been read.
    """
    # Container that holds the result table.
    td = self.driver.find_element(by=By.XPATH, value="/html/body/div[8]/div[1]")
    td_txt = td.text
    print(td_txt)
    for line in td_txt.split("\n")[3:]:
        fields = line.split(" ")
        if len(fields) < 8:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        if fields[0] < '2022':  # only data from 2022 onwards is wanted
            return False
        # Field 5 is deliberately left out, as in the original.
        self.arr_res.append(
            (fields[0], fields[1], fields[2], fields[3], fields[4], fields[6], fields[7])
        )  # all results are collected here
    return True
  5. 数据处理

import cx_Oracle
from sys import modules# 连接Oracle数据库
class oracleOperation():
    """Small helper around a cx_Oracle connection to the ``Longde`` table."""

    def openOracleConn(self):
        """Open and return a new Oracle connection.

        The connect string has the form ``user/password@host:port/sid``.
        """
        highway = cx_Oracle.connect('用户名/密码@host:端口/sid')
        return highway

    def select(self, connection):
        """Print and return every row of ``Longde`` (unconditional query).

        Added because the demo at the bottom calls ``db.select`` while the
        class did not define it.
        """
        cursor = connection.cursor()
        try:
            cursor.execute('select * from Longde')
            rows = cursor.fetchall()
        finally:
            cursor.close()
        for row in rows:
            print(row)
        return rows

    def factorSelect(self, connection, param):
        """Return True if a row matching all seven columns already exists.

        Args:
            connection: an open database connection; it is not closed here,
                that happens once all data has been processed.
            param: mapping with the bind values ``create_date``, ``dockyard``,
                ``ship_type``, ``quantity``, ``specification``, ``region`` and
                ``shipowner``.
        """
        sql = 'select * from Longde where  create_date =:create_date and dockyard =:dockyard ' \
              'and ship_type =:ship_type and quantity =:quantity and specification =:specification ' \
              'and region =:region and shipowner =:shipowner'
        cursor = connection.cursor()
        try:
            cursor.execute(sql, param)
            rows = cursor.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
        return len(rows) != 0

    def insert(self, connection, insertParam=None):
        """Insert the given rows into ``Longde`` and commit.

        Args:
            connection: an open database connection.
            insertParam: list of 7-item rows in column order. When it is
                empty or omitted, a message is printed and nothing is written.
        """
        rows = [] if insertParam is None else insertParam
        sql = "insert into Longde (create_date,dockyard,ship_type,quantity,specification,region,shipowner) " \
              "values (:create_date,:dockyard,:ship_type,:quantity,:specification,:region,:shipowner)"
        # NOTE(review): the original looped over the rows replacing any row
        # equal to '-' with None. Rows are tuples, so that test never matched
        # and '-' fields were stored verbatim. Storing them as NULL instead
        # would stop factorSelect's "=" predicates from finding such rows, so
        # the conversion is left out until both methods are changed together.
        cursor = connection.cursor()
        try:
            if not rows:
                print("插入的数据行的参数不能为空!")
            else:
                cursor.prepare(sql)
                cursor.executemany(None, rows)
                connection.commit()
        finally:
            cursor.close()


if __name__ == '__main__':
    db = oracleOperation()
    connection = db.openOracleConn()
    try:
        # A working unconditional query.
        db.select(connection)
    finally:
        connection.close()
  6. 定时任务

from apscheduler.schedulers.blocking import BlockingScheduler


def job():
    """Work to run on every scheduler tick."""


if __name__ == "__main__":
    scheduler = BlockingScheduler()
    # "job" is the function defined above; it runs every 180 seconds.
    scheduler.add_job(job, 'interval', seconds=180)
    scheduler.start()
  7. 代码(无数据库操作部分)

# coding=utf-8
import datetime
import re
import requests
import time
from io import BytesIO

import cv2
import numpy as np
from PIL import Image
from apscheduler.schedulers.blocking import BlockingScheduler
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options

from helloworld.Longde import db_oracle_Longde


class CrackSlider:
    """Logs in to imarine.cn, solves the slider captcha and scrapes orders.

    The gap position in the captcha is found by template matching on the two
    downloaded captcha images; the slider is then dragged by that distance.
    """

    def __init__(self):
        """Start Chrome, open the login page and submit the login form."""
        super().__init__()
        # Open the browser.
        self.opts = Options()
        self.opts.add_argument('--no-sandbox')  # Bypass OS security model
        self.driver = webdriver.Chrome(options=self.opts)
        self.login_url = "https://www.imarine.cn/member.php?mod=logging&action=login"
        # New-building order search page.
        self.Longde_url = "https://www.imarine.cn/order/?wtime=&shipyard=&rocker=&power=&countryarea=&shipowner="
        self.arr_res = []  # scraped rows, as a list of tuples
        try:
            self.driver.get(self.login_url)
        except Exception:
            # NOTE(review): a failed page load is only reported through this
            # message and execution continues - confirm this is intended.
            print("开始!")
        # Wait 2 seconds.
        time.sleep(2)
        # Type the account name and the password, then press the login button.
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/div[1]/div[2]/div[1]/input",
        ).send_keys("账号")
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/div[2]/div[2]/div[1]/input",
        ).send_keys("密码")
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH,
            value="/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form/button",
        ).click()
        time.sleep(1)
        self.wait = WebDriverWait(self.driver, 10)

    def get_pic(self):
        """Download the captcha images to ``target.jpg`` and ``template.png``.

        Switches the driver into the ``tcaptcha_iframe`` frame and leaves it
        there, so that ``crack_slider`` can find the slider afterwards.
        """
        time.sleep(5)
        self.driver.switch_to.frame('tcaptcha_iframe')
        # Locate the two captcha images and read their source URLs.
        target_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[2]/img"
        ).get_attribute('src')
        template_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[3]/img"
        ).get_attribute('src')
        # A timeout keeps a stalled download from blocking the scheduler.
        target_img = Image.open(BytesIO(requests.get(target_link, timeout=10).content))
        template_img = Image.open(BytesIO(requests.get(template_link, timeout=10).content))
        target_img.save('target.jpg')
        template_img.save('template.png')

    def crack_slider(self, distance):
        """Drag the captcha slider ``distance`` pixels to the right."""
        slider = self.driver.find_element(by=By.ID, value='tcaptcha_drag_thumb')
        ActionChains(self.driver).click_and_hold(slider).perform()
        ActionChains(self.driver).move_by_offset(xoffset=distance, yoffset=0).perform()
        time.sleep(2)
        ActionChains(self.driver).release().perform()
        ActionChains(self.driver).click(slider).perform()

    def query(self):
        """Open the order search form, fill in the filters and run the search.

        Every element is located by its absolute XPath and then clicked or
        typed into. Always returns 0.
        """

        def locate(xpath):
            # Single place for the repeated absolute-XPath lookup.
            return self.driver.find_element(by=By.XPATH, value=xpath)

        time.sleep(10)
        print("正在点击查询按钮......")
        time.sleep(5)
        # Open the query page.
        query = locate("/html/body/div[6]/div/div[1]/div[2]/a[5]")
        time.sleep(3)
        query.click()
        time.sleep(5)
        print("正在输入查询条件......")
        # Start date: open the date picker.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[1]/input[1]").click()
        # Year.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[1]/td[2]/a[1]").click()
        locate("/html/body/div[1]/div/div[2]/div[7]/a[9]/span").click()
        # Month.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[1]/td[2]/a[2]").click()
        locate("/html/body/div[1]/div/div[3]/a[1]/span").click()
        # Day.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[3]/td[7]").click()
        # End date: open the date picker.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[1]/input[2]").click()
        # Original comment: "current date".
        # NOTE(review): the calendar cell is hard-coded, so it presumably only
        # matches today's date by coincidence - confirm.
        locate("/html/body/div[1]/div/div[1]/div/table/tbody/tr[5]/td[4]/a").click()
        # Country filter: China.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[5]/input").send_keys("中国")
        # Run the search.
        locate("/html/body/div[8]/form/div/div[2]/dl/dd[7]/button/span").click()
        time.sleep(2)
        return 0

    def get_order(self):
        """Walk through the result pages and store every row.

        The browser and the database connection are released on every exit
        path, including the early stop and errors.

        Returns:
            False when an old (pre-2022) row ended the scan early, otherwise
            None.
        """
        try:
            self.db = db_oracle_Longde.oracleOperation()
            self.connection = self.db.openOracleConn()
            try:
                return self._collect_pages()
            finally:
                # Close the database connection.
                self.connection.close()
        finally:
            self.driver.quit()

    def _read_page_count(self):
        """Return the total page count shown in the pager, as text."""
        text = self.driver.find_element(
            by=By.XPATH, value="/html/body/div[8]/div[2]/div/label/span"
        ).text
        return text.strip("/ ").strip(" 页")

    def _next_page(self):
        """Click the "next page" link and give the page time to load."""
        self.driver.find_element(by=By.CLASS_NAME, value="nxt").click()
        time.sleep(2)

    def _collect_pages(self):
        """Scrape page after page until the last page or an old row is met."""
        pageNum = self._read_page_count()
        print("第一次--> 一共有" + pageNum + "页")
        if self.get_and_update() is False:
            # Already reached pre-2022 data on the first page.
            return False
        self._next_page()
        # The page count shown on first entry is wrong; it is only correct
        # after "next page" has been clicked once.
        pageNum = self._read_page_count()
        print("点击后--> 一共有" + pageNum + "页")
        # One page has already been read above, hence the reduced range.
        for i in range(1, int(pageNum) - 1):
            print('现在是第', str(i), '页')
            if self.get_and_update() is False:
                return False
            self._next_page()
        self.get_and_update()
        print("所有数据:")
        print(self.arr_res)
        return None

    def get_and_update(self):
        """Store the rows of the current result page.

        The text of the result container is split into lines; the first three
        lines are skipped and every following line is split on single spaces.
        Each row is appended to ``self.arr_res`` and inserted into the
        database unless an identical row is already there.

        Returns:
            False as soon as a row whose first field sorts before ``'2022'``
            is met (that row and the ones after it are not stored), True once
            the whole page has been read.
        """
        # Container that holds the result table.
        td = self.driver.find_element(by=By.XPATH, value="/html/body/div[8]/div[1]")
        td_txt = td.text
        print(td_txt)
        for line in td_txt.split("\n")[3:]:
            fields = line.split(" ")
            if len(fields) < 8:
                # Blank or truncated line: skip it instead of raising IndexError.
                continue
            if fields[0] < '2022':  # only data from 2022 onwards is wanted
                return False
            # Field 5 is deliberately left out, as in the original.
            row = (fields[0], fields[1], fields[2], fields[3], fields[4], fields[6], fields[7])
            self.arr_res.append(row)
            dict_select = {
                'create_date': fields[0], 'dockyard': fields[1], 'ship_type': fields[2],
                'quantity': fields[3], 'specification': fields[4], 'region': fields[6],
                'shipowner': fields[7],
            }
            res = self.db.factorSelect(self.connection, dict_select)
            if res is False:  # not stored yet: insert
                self.db.insert(self.connection, [row])
        return True


def add_alpha_channel(img):
    """Return a copy of a 3-channel image with an opaque alpha channel added."""
    # The channel order of the input is kept as is; only a fourth plane is added.
    first, second, third = cv2.split(img)
    alpha_channel = np.full(third.shape, 255, dtype=third.dtype)
    return cv2.merge((first, second, third, alpha_channel))


def handel_img(img):
    """Return the Canny edge map of a 4-channel image."""
    # NOTE(review): images loaded with cv2.imread are in BGR(A) order while
    # this conversion assumes RGBA; kept as is because the offset used in
    # job() was tuned against this behaviour.
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)  # to grayscale
    img_blur = cv2.GaussianBlur(img_gray, (5, 5), 1)  # Gaussian blur
    return cv2.Canny(img_blur, 60, 60)  # Canny edge detection


def match(img_jpg_path, img_png_path):
    """Return the x coordinate where the template best matches the target.

    Raises:
        FileNotFoundError: if either image cannot be read.
    """
    img_jpg = cv2.imread(img_jpg_path, cv2.IMREAD_UNCHANGED)
    img_png = cv2.imread(img_png_path, cv2.IMREAD_UNCHANGED)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if img_jpg is None or img_png is None:
        raise FileNotFoundError(
            "cannot read captcha images: %r, %r" % (img_jpg_path, img_png_path)
        )
    # Give the jpg an alpha channel if it only has three channels.
    if img_jpg.shape[2] == 3:
        img_jpg = add_alpha_channel(img_jpg)
    img = handel_img(img_jpg)
    small_img = handel_img(img_png)
    # The original passed the literal 3, which is TM_CCORR_NORMED.
    scores = cv2.matchTemplate(img, small_img, cv2.TM_CCORR_NORMED)
    _, _, _, max_loc = cv2.minMaxLoc(scores)
    return max_loc[0]  # x of the best match, i.e. the distance to move


def job():
    """Run one complete cycle: log in, pass the captcha, query and store."""
    # 1. Open chromedriver and download the images.
    cs = CrackSlider()
    cs.get_pic()
    # 2. Compare the images and compute the distance.
    img_jpg_path = 'target.jpg'  # adjust the path as needed
    img_png_path = 'template.png'  # adjust the path as needed
    distance = match(img_jpg_path, img_png_path)
    # Hand-tuned scaling and offset from image pixels to slider travel.
    distance = distance / 680 * 340 - 25
    # 3. Move the slider.
    cs.crack_slider(distance)
    # 4. Query and store the orders.
    cs.query()
    cs.get_order()


if __name__ == "__main__":
    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # BlockingScheduler: fetch the data every 3 minutes.
    # For a single run without the scheduler, call job() directly instead.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, 'interval', seconds=180)
    scheduler.start()

Python自动化实现船舶订单抓取相关推荐

  1. Python爬虫实战六之抓取爱问知识人问题并保存至数据库

    大家好,本次为大家带来的是抓取爱问知识人的问题并将问题和答案保存到数据库的方法,涉及的内容包括: Urllib的用法及异常处理 Beautiful Soup的简单应用 MySQLdb的基础用法 正则表 ...

  2. Atitit.web的自动化操作与信息抓取 attilax总结

    Atitit.web的自动化操作与信息抓取 attilax总结 1. Web操作自动化工具,可以简单的划分为2大派系: 1.录制回放 2.手工编写 2. 常 ...

  3. 基于python的今日头条文章抓取内含signature算法

    基于python的今日头条文章抓取内含signature算法 扫二维码添加微信 备注:爬虫 , 拉你进爬虫交流群 或许你会成为第一个加群的人~ 刚有的创群想法! 1. 简单文字描述头条爬虫注意点 由于 ...

  4. Python爬虫包 BeautifulSoup 递归抓取实例详解

    Python爬虫包 BeautifulSoup 递归抓取实例详解 概要: 爬虫的主要目的就是为了沿着网络抓取需要的内容.它们的本质是一种递归的过程.它们首先需要获得网页的内容,然后分析页面内容并找到另 ...

  5. [Python爬虫] 三、数据抓取之Requests HTTP 库

    往期内容提要: [Python爬虫] 一.爬虫原理之HTTP和HTTPS的请求与响应 [Python爬虫] 二.爬虫原理之定义.分类.流程与编码格式 一.urllib 模块 所谓网页抓取,就是把URL ...

  6. Python爬虫之gif图片抓取

    Python爬虫之gif图片抓取 标签:图片爬虫 这几天,对于怎么去爬取图片很感兴趣,就研究了一下,图片爬虫可以说是有简单,更有复杂的,今天,我做了一个比较简单的gif的图片爬虫,仅仅学习一下怎么进行 ...

  7. Python之Email邮箱账号抓取

    Python之Email邮箱账号抓取

  8. [Python爬虫] 四、数据抓取之HTTP/HTTPS抓包工具Fiddler

    往期内容提要: [Python爬虫] 一.爬虫原理之HTTP和HTTPS的请求与响应 [Python爬虫] 二.爬虫原理之定义.分类.流程与编码格式 [Python爬虫] 三.数据抓取之Request ...

  9. 10分钟教你用Python玩转微信之抓取好友个性签名制作词云

    10分钟教你用Python玩转微信之抓取好友个性签名制作词云 01 前言+展示 各位小伙伴我又来啦.今天带大家玩点好玩的东西,用Python抓取我们的微信好友个性签名,然后制作词云.怎样,有趣吧~好了 ...

最新文章

  1. oracle 分区使用情况,Oracle Hash分区的使用总结
  2. php 如何实现表格行列的对齐,excel怎样把每列文字对齐
  3. updatebyprimarykeyselective的where条件是全部字段_ArcGIS 字段计算器
  4. python实践心得体会_“Python自然语言实践”——总结(一),实战
  5. [html] 写页面布局时你有考虑过分辨率因素吗?还要考虑哪些因素呢
  6. Linux 中安装 Mysql
  7. RedHat6.7安装教程,图解,超详细
  8. Python Flask Web 第三课 —— 模板
  9. mac上安装mongoDb以及简单使用
  10. PHP 高并发秒杀解决方案
  11. python风格logo_十分钟利用Python制作属于你自己的个性logo
  12. vue中点击打开新的页面window.open()
  13. ui设计app设计风格有哪些?ui设计app界面设计流程是什么?
  14. 控制萤石云摄像头转头
  15. web使用百度地图实现经纬度定位和轨迹
  16. 计算机房宣传标语,机房安全标语
  17. 【java】BeanUtils.populate()的使用
  18. 【C++学习笔记】处理类型和自定义数据结构
  19. 扩展坞可以把手机投到显示器吗_轻松转接显示器,通吃PC和手机,ORICO USB-C扩展坞体验...
  20. 对话模型,DialogBERT和DialogWAE优势何在?

热门文章

  1. [MTK][FAQ14772] 如何实现插上电池自动开机
  2. 对innerHtml的理解(附上例子)
  3. Spinner用法详解
  4. 桶装水价格表 it 计算机,桶装水配送价格价格如何计算?「大力水手」
  5. 朝鲜女性择偶的四大标准
  6. html首行下沉效果,css仿word首字下沉效果示例
  7. [ 利器篇 ] - LInux 下百度网盘客户端 BaiduPCS
  8. 计算机视觉2021年3月28最新论文
  9. Head First 设计模式(四)装饰者模式
  10. thinkpad x1 carbon 黑苹果macOS Mojave 10.14.6 安装