Python自动化实现船舶订单抓取

打开浏览器，登录 “龙de船人”

需要先安装 selenium（pip install selenium），并下载与浏览器版本对应的 chromedriver，放到 Python 安装目录下

1.1在浏览器输入chrome://version，即可查询版本号

1.2 下载与 Chrome 浏览器版本对应的驱动文件（chromedriver），放到 Python 安装目录下

下载地址：http://chromedriver.storage.googleapis.com/index.html

选择正确的版本号

windows64位选win32即可

打开页面代码如下：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import time


class CrackSlider:
    """Drive a Chrome session that opens the imarine.cn login page."""

    def __init__(self):
        """Start Chrome and load the login page."""
        super().__init__()
        self.opts = Options()
        # Start Chrome with its sandbox disabled.
        self.opts.add_argument('--no-sandbox')
        self.driver = webdriver.Chrome(options=self.opts)
        self.login_url = "https://www.imarine.cn/member.php?mod=logging&action=login"
        try:
            # The browser loads the page itself, so there is no GET/POST
            # distinction to make: driver.get() simply opens the URL.
            self.driver.get(self.login_url)
        except Exception as exc:
            # Best effort, as before: keep going, but show what went wrong
            # instead of printing the bare start message only.
            print("开始！", exc)
        # Wait 2 seconds
        time.sleep(2)

自动登录（难点：滑块验证）

from io import BytesIO
import cv2
import numpy as np
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains

import time

import requests
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


class CrackSlider:
    """Log in to imarine.cn and solve the slider captcha shown on login."""

    def __init__(self, username="账号", password="密码"):
        """Fill in and submit the login form.

        Args:
            username: Text typed into the first input of the login form.
            password: Text typed into the second input of the login form.
        """
        super().__init__()
        # Browser start-up is omitted in this excerpt; it must have set
        # self.driver before the lines below run.
        # Wait 2 seconds
        time.sleep(2)
        form = "/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form"
        # Enter account name and password
        self.driver.find_element(
            by=By.XPATH, value=form + "/div[1]/div[2]/div[1]/input"
        ).send_keys(username)
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH, value=form + "/div[2]/div[2]/div[1]/input"
        ).send_keys(password)
        time.sleep(1)
        self.driver.find_element(by=By.XPATH, value=form + "/button").click()
        time.sleep(1)
        self.wait = WebDriverWait(self.driver, 10)

    @staticmethod
    def _download(url):
        """Return the body of *url*, raising on timeout or HTTP error."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.content

    def get_pic(self):
        """Save the captcha images as 'target.jpg' and 'template.png'.

        Leaves the driver switched into the 'tcaptcha_iframe' frame.
        """
        time.sleep(5)
        self.driver.switch_to.frame('tcaptcha_iframe')
        # Locate the two captcha images and read their source URLs
        target_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[2]/img"
        ).get_attribute('src')
        template_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[3]/img"
        ).get_attribute('src')
        target_img = Image.open(BytesIO(self._download(target_link)))
        template_img = Image.open(BytesIO(self._download(template_link)))
        # JPEG cannot store palette or alpha modes, so convert to RGB first.
        target_img.convert('RGB').save('target.jpg')
        template_img.save('template.png')

    def crack_slider(self, distance):
        """Drag the captcha thumb *distance* pixels to the right.

        Args:
            distance: Horizontal offset in pixels; rounded to an integer
                because pointer-move offsets are integers.
        """
        slider = self.driver.find_element(by=By.ID, value='tcaptcha_drag_thumb')
        ActionChains(self.driver).click_and_hold(slider).perform()
        ActionChains(self.driver).move_by_offset(
            xoffset=int(round(distance)), yoffset=0
        ).perform()
        time.sleep(2)
        ActionChains(self.driver).release().perform()
        ActionChains(self.driver).click(slider).perform()


def add_alpha_channel(img):
    """Return *img* (3 channels) with a fully opaque alpha channel appended."""
    # Split into the image's three channels, keeping their order
    first, second, third = cv2.split(img)
    # Alpha channel filled with 255
    alpha_channel = np.ones(third.shape, dtype=third.dtype) * 255
    # Merge the four channels back together
    return cv2.merge((first, second, third, alpha_channel))


def handel_img(img):
    """Return the Canny edge map of a 4-channel image.

    NOTE(review): cv2.imread yields BGR(A) data while the conversion code
    below is the RGBA one; confirm whether BGRA2GRAY was intended.
    """
    imgGray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)  # to grayscale
    imgBlur = cv2.GaussianBlur(imgGray, (5, 5), 1)  # Gaussian blur
    return cv2.Canny(imgBlur, 60, 60)  # Canny edge detection


def match(img_jpg_path, img_png_path):
    """Return the x coordinate where the template best matches the target.

    Args:
        img_jpg_path: Path of the captcha background image.
        img_png_path: Path of the puzzle-piece template image.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    # Read both images unchanged
    img_jpg = cv2.imread(img_jpg_path, cv2.IMREAD_UNCHANGED)
    img_png = cv2.imread(img_png_path, cv2.IMREAD_UNCHANGED)
    if img_jpg is None:
        raise FileNotFoundError(img_jpg_path)
    if img_png is None:
        raise FileNotFoundError(img_png_path)
    # Add an alpha channel when the background has only three channels
    if img_jpg.shape[2] == 3:
        img_jpg = add_alpha_channel(img_jpg)
    img = handel_img(img_jpg)
    small_img = handel_img(img_png)
    # Same method as the former literal 3 (TM_CCORR_NORMED).
    scores = cv2.matchTemplate(img, small_img, cv2.TM_CCORR_NORMED)
    _, _, _, max_loc = cv2.minMaxLoc(scores)
    return max_loc[0]  # x of the best match


def job():
    """Log in, download the captcha images and drag the slider."""
    # 1. Start chromedriver and download the images
    cs = CrackSlider()
    cs.get_pic()
    # 2. Compare the images and compute the distance
    img_jpg_path = 'target.jpg'  # adjust the path as needed
    img_png_path = 'template.png'  # adjust the path as needed
    distance = match(img_jpg_path, img_png_path)
    # Hand-tuned scaling; presumably image width 680 shown at 340 with a
    # 25-pixel offset (confirm against the page).
    distance = distance / 680 * 340 - 25
    # 3. Move the slider
    cs.crack_slider(distance)

船舶订单查询

根据元素full xpath定位元素，进行点击或是文本输入

    def query(self):
        """Open the order query form, fill in the filters and run the search.

        Returns:
            Always 0.
        """
        time.sleep(10)
        # get_pic() switched the driver into the captcha iframe; the XPaths
        # below address the top-level document, so switch back first.
        self.driver.switch_to.default_content()
        print("正在点击查询按钮......")
        time.sleep(5)
        # Open the query entry
        query_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div[6]/div/div[1]/div[2]/a[5]")
        time.sleep(3)
        query_link.click()
        time.sleep(5)
        print("正在输入查询条件......")
        form = "/html/body/div[8]/form/div/div[2]/dl"
        picker = "/html/body/div[1]/div"
        # NOTE(review): calendar cells are addressed by fixed position, so
        # the chosen dates depend on the calendar layout on the day of the
        # run; confirm before relying on them.
        click_steps = (
            form + "/dd[1]/input[1]",                           # start date field
            picker + "/div[1]/div/table/tbody/tr[1]/td[2]/a[1]",  # year selector
            picker + "/div[2]/div[7]/a[9]/span",                # year entry
            picker + "/div[1]/div/table/tbody/tr[1]/td[2]/a[2]",  # month selector
            picker + "/div[3]/a[1]/span",                       # month entry
            picker + "/div[1]/div/table/tbody/tr[3]/td[7]",      # day cell
            form + "/dd[1]/input[2]",                           # end date field
            picker + "/div[1]/div/table/tbody/tr[5]/td[4]/a",    # end day cell
        )
        for xpath in click_steps:
            self.driver.find_element(by=By.XPATH, value=xpath).click()
        # Country filter: China
        self.driver.find_element(
            by=By.XPATH, value=form + "/dd[5]/input").send_keys("中国")
        # Run the search
        self.driver.find_element(
            by=By.XPATH, value=form + "/dd[7]/button/span").click()
        time.sleep(2)
        return 0

获取表单内容

def get_and_update(self):
    """Append the order rows of the current result page to self.arr_res.

    Returns:
        False as soon as a row dated before 2022 is met (rows after it on
        the page are not read), True when the whole page was consumed.
    """
    # Node that holds the table text
    td = self.driver.find_element(by=By.XPATH, value="/html/body/div[8]/div[1]")
    td_txt = td.text
    print(td_txt)
    # The first three lines are skipped (presumably headers).
    for line in td_txt.split("\n")[3:]:
        arr1 = line.split(" ")
        if len(arr1) < 8:
            # Blank or incomplete line: it cannot supply all seven fields.
            continue
        if arr1[0] < '2022':  # keep only data from 2022 onwards
            return False
        # Field 5 is deliberately left out, as in the original tuple.
        self.arr_res.append(
            (arr1[0], arr1[1], arr1[2], arr1[3], arr1[4], arr1[6], arr1[7]))
    return True

数据处理

import cx_Oracle
from sys import modules# 连接Oracle数据库
class oracleOperation():
    """Small helper around the Oracle table ``Longde``."""

    # Column order shared by the lookup and the insert statement.
    _COLUMNS = ('create_date', 'dockyard', 'ship_type', 'quantity',
                'specification', 'region', 'shipowner')
    # Text that stands for "no value"; it is stored and matched as NULL.
    _EMPTY = '-'

    def openOracleConn(self, dsn='用户名/密码@host:端口/sid'):
        """Open and return a database connection.

        Args:
            dsn: Connect string in the form user/password@host:port/sid.
        """
        return cx_Oracle.connect(dsn)

    def _as_binds(self, row):
        """Return *row* as a column->value dict with '-' replaced by None.

        *row* may be a dict keyed by column name or a sequence in column
        order. The input object is not modified.
        """
        values = row if isinstance(row, dict) else dict(zip(self._COLUMNS, row))
        return {
            column: None if values[column] == self._EMPTY else values[column]
            for column in self._COLUMNS
        }

    def select(self, connection):
        """Return every row of Longde (unconditional query)."""
        cursor = connection.cursor()
        try:
            cursor.execute('select * from Longde')
            return cursor.fetchall()
        finally:
            cursor.close()

    def factorSelect(self, connection, param):
        """Return True when a row matching all seven columns exists.

        Args:
            connection: Open database connection; it is left open.
            param: Column values, as a dict keyed by column name or as a
                sequence in column order. '-' is treated as NULL.
        """
        binds = self._as_binds(param)
        # "col = NULL" is never true in SQL, so NULL values need their own
        # branch or rows stored with NULLs would never be found again.
        conditions = ' and '.join(
            '({0} = :{0} or ({0} is null and :{0} is null))'.format(column)
            for column in self._COLUMNS
        )
        sql = 'select 1 from Longde where ' + conditions
        cursor = connection.cursor()
        try:
            cursor.execute(sql, binds)
            return cursor.fetchone() is not None
        finally:
            cursor.close()

    def insert(self, connection, insertParam=None):
        """Insert rows into Longde and commit.

        Args:
            connection: Open database connection; it is left open.
            insertParam: List of rows (sequences in column order, or dicts
                keyed by column name). Fields equal to '-' are stored as
                NULL. An empty or missing list only prints a message.
        """
        if not insertParam:
            print("插入的数据行的参数不能为空！")
            return
        # Build new row dicts so the caller's list is left untouched.
        rows = [self._as_binds(row) for row in insertParam]
        sql = "insert into Longde (create_date,dockyard,ship_type,quantity,specification,region,shipowner) " \
              "values (:create_date,:dockyard,:ship_type,:quantity,:specification,:region,:shipowner)"
        cursor = connection.cursor()
        try:
            # NOTE(review): drivers may infer bind types from the first
            # row; confirm that a NULL in the first row of a multi-row
            # batch behaves as expected.
            cursor.executemany(sql, rows)
            connection.commit()
        finally:
            cursor.close()


if __name__ == '__main__':
    db = oracleOperation()
    connection = db.openOracleConn()
    try:
        # Unconditional query as a smoke test
        print(db.select(connection))
    finally:
        connection.close()

定时任务

from apscheduler.schedulers.blocking import BlockingScheduler


def job():
    """Work to repeat on every scheduler run (placeholder)."""


if __name__ == "__main__":
    scheduler = BlockingScheduler()
    # Run job (the function above) every 180 seconds
    scheduler.add_job(job, 'interval', seconds=180)
    try:
        # start() blocks until the scheduler is stopped
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        # Ctrl+C / interpreter exit: leave without a traceback
        pass

代码（无数据库操作部分）

# coding=utf-8
import datetime
import re
import time
from io import BytesIO

import cv2
import numpy as np
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options

from helloworld.Longde import db_oracle_Longde


class CrackSlider:
    """Log in to imarine.cn, solve the slider captcha and collect orders.

    The captcha is solved by locating the puzzle piece inside the
    background image and dragging the slider by the scaled distance.
    """

    def __init__(self, username="账号", password="密码"):
        """Start Chrome, open the login page and submit the login form.

        Args:
            username: Text typed into the first input of the login form.
            password: Text typed into the second input of the login form.
        """
        super().__init__()
        # Start the browser
        self.opts = Options()
        self.opts.add_argument('--no-sandbox')  # Bypass OS security model
        self.login_url = "https://www.imarine.cn/member.php?mod=logging&action=login"
        # New-build order query page
        self.Longde_url = "https://www.imarine.cn/order/?wtime=&shipyard=&rocker=&power=&countryarea=&shipowner="
        self.arr_res = []  # collected rows
        self.db = None
        self.connection = None
        self.driver = webdriver.Chrome(options=self.opts)
        self.wait = WebDriverWait(self.driver, 10)
        try:
            self._login(username, password)
        except Exception:
            # Do not leave an orphaned browser behind when login fails.
            self.close()
            raise

    def _login(self, username, password):
        """Open the login page and submit the credentials."""
        try:
            self.driver.get(self.login_url)
        except Exception as exc:
            # Best effort, as before: keep going, but show what went wrong.
            print("开始！", exc)
        # Wait 2 seconds
        time.sleep(2)
        form = "/html/body/div[1]/main/div/div[1]/div/div[2]/div[1]/div/form"
        self.driver.find_element(
            by=By.XPATH, value=form + "/div[1]/div[2]/div[1]/input"
        ).send_keys(username)
        time.sleep(1)
        self.driver.find_element(
            by=By.XPATH, value=form + "/div[2]/div[2]/div[1]/input"
        ).send_keys(password)
        time.sleep(1)
        self.driver.find_element(by=By.XPATH, value=form + "/button").click()
        time.sleep(1)

    def close(self):
        """Quit the browser and close the DB connection; safe to call twice."""
        driver, self.driver = self.driver, None
        connection, self.connection = self.connection, None
        try:
            if driver is not None:
                driver.quit()
        finally:
            if connection is not None:
                connection.close()

    @staticmethod
    def _download(url):
        """Return the body of *url*, raising on timeout or HTTP error."""
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.content

    def get_pic(self):
        """Save the captcha images as 'target.jpg' and 'template.png'.

        Leaves the driver switched into the 'tcaptcha_iframe' frame.
        """
        time.sleep(5)
        self.driver.switch_to.frame('tcaptcha_iframe')
        # Locate the two captcha images and read their source URLs
        target_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[2]/img"
        ).get_attribute('src')
        template_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div/div[3]/div[2]/div[1]/div[3]/img"
        ).get_attribute('src')
        target_img = Image.open(BytesIO(self._download(target_link)))
        template_img = Image.open(BytesIO(self._download(template_link)))
        # JPEG cannot store palette or alpha modes, so convert to RGB first.
        target_img.convert('RGB').save('target.jpg')
        template_img.save('template.png')

    def crack_slider(self, distance):
        """Drag the captcha thumb *distance* pixels to the right.

        Args:
            distance: Horizontal offset in pixels; rounded to an integer
                because pointer-move offsets are integers.
        """
        slider = self.driver.find_element(by=By.ID, value='tcaptcha_drag_thumb')
        ActionChains(self.driver).click_and_hold(slider).perform()
        ActionChains(self.driver).move_by_offset(
            xoffset=int(round(distance)), yoffset=0
        ).perform()
        time.sleep(2)
        ActionChains(self.driver).release().perform()
        ActionChains(self.driver).click(slider).perform()

    def query(self):
        """Open the order query form, fill in the filters and run the search.

        Returns:
            Always 0.
        """
        time.sleep(10)
        # get_pic() switched the driver into the captcha iframe; the XPaths
        # below address the top-level document, so switch back first.
        self.driver.switch_to.default_content()
        print("正在点击查询按钮......")
        time.sleep(5)
        # Open the query entry
        query_link = self.driver.find_element(
            by=By.XPATH, value="/html/body/div[6]/div/div[1]/div[2]/a[5]")
        time.sleep(3)
        query_link.click()
        time.sleep(5)
        print("正在输入查询条件......")
        form = "/html/body/div[8]/form/div/div[2]/dl"
        picker = "/html/body/div[1]/div"
        # NOTE(review): calendar cells are addressed by fixed position, so
        # the chosen dates depend on the calendar layout on the day of the
        # run; confirm before relying on them.
        click_steps = (
            form + "/dd[1]/input[1]",                           # start date field
            picker + "/div[1]/div/table/tbody/tr[1]/td[2]/a[1]",  # year selector
            picker + "/div[2]/div[7]/a[9]/span",                # year entry
            picker + "/div[1]/div/table/tbody/tr[1]/td[2]/a[2]",  # month selector
            picker + "/div[3]/a[1]/span",                       # month entry
            picker + "/div[1]/div/table/tbody/tr[3]/td[7]",      # day cell
            form + "/dd[1]/input[2]",                           # end date field
            picker + "/div[1]/div/table/tbody/tr[5]/td[4]/a",    # end day cell
        )
        for xpath in click_steps:
            self.driver.find_element(by=By.XPATH, value=xpath).click()
        # Country filter: China
        self.driver.find_element(
            by=By.XPATH, value=form + "/dd[5]/input").send_keys("中国")
        # Run the search
        self.driver.find_element(
            by=By.XPATH, value=form + "/dd[7]/button/span").click()
        time.sleep(2)
        return 0

    def _page_count(self):
        """Return the total page count shown in the pager label."""
        label = self.driver.find_element(
            by=By.XPATH, value="/html/body/div[8]/div[2]/div/label/span").text
        found = re.search(r"\d+", label)
        if found is None:
            raise ValueError("无法解析总页数: " + repr(label))
        return int(found.group())

    def _next_page(self):
        """Click the "next page" control; return False when there is none."""
        buttons = self.driver.find_elements(by=By.CLASS_NAME, value="nxt")
        if not buttons:
            return False
        buttons[0].click()
        time.sleep(2)
        return True

    def get_order(self):
        """Walk through all result pages, storing new rows in the database.

        The browser and the database connection are released when this
        method returns or raises.

        Returns:
            False when a row dated before 2022 stopped the walk early,
            otherwise None.
        """
        self.db = db_oracle_Longde.oracleOperation()
        self.connection = self.db.openOracleConn()
        try:
            print("第一次--> 一共有" + str(self._page_count()) + "页")
            if self.get_and_update() is False:
                return False
            if self._next_page():
                # The total shown on first entry is wrong; it is only
                # correct after moving to the next page.
                total = self._page_count()
                print("点击后--> 一共有" + str(total) + "页")
                # Page 1 is already done; continue from page 2.
                for number in range(2, total + 1):
                    print('现在是第', str(number), '页')
                    if self.get_and_update() is False:
                        return False
                    if number < total and not self._next_page():
                        break
            print("所有数据：")
            print(self.arr_res)
            return None
        finally:
            # Runs on the early "return False" path as well, so neither the
            # browser nor the DB connection leaks between scheduled runs.
            self.close()

    def get_and_update(self):
        """Store the order rows of the current result page.

        Each row is appended to self.arr_res and inserted into the
        database unless an identical row is already there.

        Returns:
            False as soon as a row dated before 2022 is met (rows after it
            on the page are not read), True when the whole page was
            consumed.
        """
        columns = ('create_date', 'dockyard', 'ship_type', 'quantity',
                   'specification', 'region', 'shipowner')
        # Node that holds the table text
        table = self.driver.find_element(
            by=By.XPATH, value="/html/body/div[8]/div[1]")
        td_txt = table.text
        print(td_txt)
        # The first three lines are skipped (presumably headers).
        for line in td_txt.split("\n")[3:]:
            arr1 = line.split(" ")
            if len(arr1) < 8:
                # Blank or incomplete line: it cannot supply all fields.
                continue
            if arr1[0] < '2022':  # keep only data from 2022 onwards
                return False
            # Field 5 is deliberately left out, as in the original tuple.
            row = (arr1[0], arr1[1], arr1[2], arr1[3], arr1[4], arr1[6], arr1[7])
            self.arr_res.append(row)
            exists = self.db.factorSelect(self.connection, dict(zip(columns, row)))
            if exists is False:
                self.db.insert(self.connection, [row])
        return True


def add_alpha_channel(img):
    """Return *img* (3 channels) with a fully opaque alpha channel appended."""
    # Split into the image's three channels, keeping their order
    first, second, third = cv2.split(img)
    # Alpha channel filled with 255
    alpha_channel = np.ones(third.shape, dtype=third.dtype) * 255
    # Merge the four channels back together
    return cv2.merge((first, second, third, alpha_channel))


def handel_img(img):
    """Return the Canny edge map of a 4-channel image.

    NOTE(review): cv2.imread yields BGR(A) data while the conversion code
    below is the RGBA one; confirm whether BGRA2GRAY was intended.
    """
    imgGray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)  # to grayscale
    imgBlur = cv2.GaussianBlur(imgGray, (5, 5), 1)  # Gaussian blur
    return cv2.Canny(imgBlur, 60, 60)  # Canny edge detection


def match(img_jpg_path, img_png_path):
    """Return the x coordinate where the template best matches the target.

    Args:
        img_jpg_path: Path of the captcha background image.
        img_png_path: Path of the puzzle-piece template image.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    # Read both images unchanged
    img_jpg = cv2.imread(img_jpg_path, cv2.IMREAD_UNCHANGED)
    img_png = cv2.imread(img_png_path, cv2.IMREAD_UNCHANGED)
    if img_jpg is None:
        raise FileNotFoundError(img_jpg_path)
    if img_png is None:
        raise FileNotFoundError(img_png_path)
    # Add an alpha channel when the background has only three channels
    if img_jpg.shape[2] == 3:
        img_jpg = add_alpha_channel(img_jpg)
    img = handel_img(img_jpg)
    small_img = handel_img(img_png)
    # Same method as the former literal 3 (TM_CCORR_NORMED).
    scores = cv2.matchTemplate(img, small_img, cv2.TM_CCORR_NORMED)
    _, _, _, max_loc = cv2.minMaxLoc(scores)
    return max_loc[0]  # x of the best match


def job():
    """Run one complete scrape: log in, solve the captcha, query, store."""
    # 1. Start chromedriver and log in
    cs = CrackSlider()
    try:
        cs.get_pic()
        # 2. Compare the images and compute the distance
        img_jpg_path = 'target.jpg'  # adjust the path as needed
        img_png_path = 'template.png'  # adjust the path as needed
        distance = match(img_jpg_path, img_png_path)
        # Hand-tuned scaling; presumably image width 680 shown at 340 with
        # a 25-pixel offset (confirm against the page).
        distance = distance / 680 * 340 - 25
        # 3. Move the slider
        cs.crack_slider(distance)
        # 4. Query and store the orders
        cs.query()
        cs.get_order()
    finally:
        # Release the browser even when a step above fails.
        cs.close()


if __name__ == "__main__":
    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # BlockingScheduler: fetch the data every 3 minutes.
    # For a single run, call job() directly instead.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, 'interval', seconds=180)
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        # Ctrl+C / interpreter exit: leave without a traceback
        pass

Python自动化实现船舶订单抓取相关推荐

Python爬虫实战六之抓取爱问知识人问题并保存至数据库
大家好,本次为大家带来的是抓取爱问知识人的问题并将问题和答案保存到数据库的方法,涉及的内容包括: Urllib的用法及异常处理 Beautiful Soup的简单应用 MySQLdb的基础用法正则表 ...
Atitit.web的自动化操作与信息抓取 attilax总结
Atitit.web的自动化操作与信息抓取 attilax总结 1. Web操作自动化工具,可以简单的划分为2大派系: 1.录制回放 2.手工编写 2. 常 ...
基于python的今日头条文章抓取内含signature算法
基于python的今日头条文章抓取内含signature算法扫二维码添加微信备注:爬虫 , 拉你进爬虫交流群或许你会成为第一个加群的人~ 刚有的创群想法! 1. 简单文字描述头条爬虫注意点由于 ...
Python爬虫包 BeautifulSoup 递归抓取实例详解
Python爬虫包 BeautifulSoup 递归抓取实例详解概要: 爬虫的主要目的就是为了沿着网络抓取需要的内容.它们的本质是一种递归的过程.它们首先需要获得网页的内容,然后分析页面内容并找到另 ...
[Python爬虫] 三、数据抓取之Requests HTTP 库
往期内容提要: [Python爬虫] 一.爬虫原理之HTTP和HTTPS的请求与响应 [Python爬虫] 二.爬虫原理之定义.分类.流程与编码格式一.urllib 模块所谓网页抓取,就是把URL ...
Python爬虫之gif图片抓取
Python爬虫之gif图片抓取标签:图片爬虫这几天,对于怎么去爬取图片很感兴趣,就研究了一下,图片爬虫可以说是有简单,更有复杂的,今天,我做了一个比较简单的gif的图片爬虫,仅仅学习一下怎么进行 ...
Python之Email邮箱账号抓取
Python之Email邮箱账号抓取
[Python爬虫] 四、数据抓取之HTTP/HTTPS抓包工具Fiddler
往期内容提要: [Python爬虫] 一.爬虫原理之HTTP和HTTPS的请求与响应 [Python爬虫] 二.爬虫原理之定义.分类.流程与编码格式 [Python爬虫] 三.数据抓取之Request ...
10分钟教你用Python玩转微信之抓取好友个性签名制作词云
10分钟教你用Python玩转微信之抓取好友个性签名制作词云 01 前言+展示各位小伙伴我又来啦.今天带大家玩点好玩的东西,用Python抓取我们的微信好友个性签名,然后制作词云.怎样,有趣吧~好了 ...

Python自动化实现船舶订单抓取

打开浏览器，登录 “龙de船人”

打开页面代码如下：

自动登录（难点：滑块验证）

船舶订单查询

获取表单内容

数据处理

定时任务

代码（无数据库操作部分）

Python自动化实现船舶订单抓取相关推荐

最新文章

热门文章