Selenium is an automated-testing tool rather than a crawler, and as a crawler its performance is mediocre. But since it can read page content, it can still be used to scrape data, and because it simulates real browser visits, the site treats the traffic as normal user behavior.

The project consists of a few files, all in the same directory:

setting.cfg : configuration file

mssql.py : database access

lagou.py : Lagou page element lookup and related operations

mydriver.py : driver-related operations

main.py : entry script

For setting up element lookups in Selenium, see my earlier write-ups Python selenium自动化模拟登录操作(一) and selenium 元素定位. The overall approach is the same as in Python scrapy 爬取拉勾网招聘信息. For pagination, however, Selenium locates the "下一页" (next page) element and clicks it to trigger the jump: after each page is read and its data saved, the script clicks through to the next page until the total page count is reached. The awkward part is deciding whether a page has finished loading; if the elements are read too early, the lookups throw errors. (See: 等待页面加载完成 (Waits).)
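The load-check problem described above is what Selenium's explicit waits solve: instead of a fixed sleep, keep polling for the element or state you need, up to a timeout. The idea can be sketched as a small generic helper (a standalone illustration, not part of the original script):

```python
import time

def wait_until(condition, timeout=10.0, poll=0.5):
    """Poll `condition` until it returns a truthy value or `timeout` elapses.

    This mirrors what selenium's WebDriverWait.until() does: check repeatedly
    for the thing you need instead of sleeping a fixed 3 seconds and hoping.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        result = condition()
        if result:
            return result          # ready: hand the value back to the caller
        time.sleep(poll)           # not ready yet: wait briefly and retry
    raise TimeoutError("condition not met within %.1fs" % timeout)
```

With Selenium itself the equivalent is `WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'search_input')))`, which could replace the fixed `time.sleep(3)` used in the pagination loop below.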

A drawback of this script: it logs in once per run and scrapes a single search, so scraping a different position means changing the keyword and logging in all over again. (You could, of course, pass in a list/set of positions and, after a single login, run one search per position.)

【seleniumlagou】table definition:

USE [Myspider]
GO
CREATE TABLE [dbo].[seleniumlagou](
    [companyfullname] [varchar](50) NULL,
    [positionname]    [varchar](50) NULL,
    [salary]          [varchar](20) NULL,
    [workyear]        [varchar](20) NULL,
    [education]       [varchar](20) NULL,
    [city]            [varchar](20) NULL,
    [district]        [varchar](20) NULL,
    [financestage]    [varchar](50) NULL,
    [industryfield]   [varchar](100) NULL,
    [firsttype]       [varchar](50) NULL,
    [positionlables]  [varchar](100) NULL
) ON [PRIMARY]
GO

【setting.cfg】configuration file

[mssql]
MSSQL_HOST = 'HZC'
MSSQL_USER = 'kk'
MSSQL_PASSWD = 'kk'
MSSQL_DBNAME = 'Myspider'

[driver]
driverPath = 'D:/Python35/selenium/phantomjs/bin/phantomjs.exe'
imgPath = 'E:/mypy/lagou/img.png'

【mssql.py】database access script

# -*- coding: utf-8 -*-
# python 3.5
import sys
import pymssql
import configparser

sys.path.append(r'E:/mypy/lagou')

cf = configparser.ConfigParser()
cf.read("setting.cfg")

MSSQL_HOST = cf.get("mssql", "MSSQL_HOST").strip().replace("\'", "").replace(r"\n", "")
MSSQL_USER = cf.get("mssql", "MSSQL_USER").strip().replace("\'", "").replace(r"\n", "")
MSSQL_PASSWD = cf.get("mssql", "MSSQL_PASSWD").strip().replace("\'", "").replace(r"\n", "")
MSSQL_DBNAME = cf.get("mssql", "MSSQL_DBNAME").strip().replace("\'", "").replace(r"\n", "")


class MSSQL(object):

    def __init__(self):
        self.host = MSSQL_HOST
        self.user = MSSQL_USER
        self.pwd = MSSQL_PASSWD
        self.db = MSSQL_DBNAME
        self._conn = self.GetConnect()
        if self._conn:
            self._cur = self._conn.cursor()

    # connect to the database
    def GetConnect(self):
        conn = False
        try:
            conn = pymssql.connect(host=self.host, user=self.user,
                                   password=self.pwd, database=self.db)
        except Exception as err:
            print("Failed to connect to the database: %s" % err)
        else:
            return conn

    # run a query and return all rows
    def ExecQuery(self, sql):
        res = ""
        try:
            self._cur.execute(sql)
            res = self._cur.fetchall()
        except Exception as err:
            print("Query failed: %s" % err)
        else:
            return res

    # run a non-query statement (INSERT/UPDATE/DELETE) with commit/rollback
    def ExecNonQuery(self, sql):
        flag = False
        try:
            self._cur.execute(sql)
            self._conn.commit()
            flag = True
        except Exception as err:
            flag = False
            self._conn.rollback()
            print("Execution failed: %s" % err)
        else:
            return flag

    # print connection info
    def GetConnectInfo(self):
        print("Connection info:")
        print("Server: %s , user: %s , database: %s" % (self.host, self.user, self.db))

    # close the cursor and connection
    def Close(self):
        if self._conn:
            try:
                if self._cur:
                    self._cur.close()
                self._conn.close()
            except Exception as err:
                raise RuntimeError("Close failed: %s" % err)
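One caveat with `ExecNonQuery` as used later: the INSERT statement is built with `%` string formatting, so a job title containing a single quote breaks the SQL. `pymssql` supports parameterized queries (`cursor.execute(sql, params)` with `%s` placeholders), which avoid this. The idea can be smoke-tested with stdlib `sqlite3` (a stand-in used here only because it needs no SQL Server; its placeholder is `?`):

```python
import sqlite3

# In-memory stand-in for the seleniumlagou table (sqlite3, stdlib only).
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE seleniumlagou (companyfullname TEXT, positionname TEXT)")

# Parameterized insert: the driver quotes the values, so a name containing
# a single quote is stored intact instead of breaking the SQL statement.
row = ("O'Reilly Media", "DBA")
cur.execute("INSERT INTO seleniumlagou (companyfullname, positionname) VALUES (?, ?)", row)
conn.commit()

cur.execute("SELECT companyfullname FROM seleniumlagou")
print(cur.fetchone()[0])  # O'Reilly Media
```

With pymssql the same call is `self._cur.execute("INSERT ... VALUES (%s, %s)", row)`; `ExecNonQuery` would just need to accept an optional params tuple.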

【mydriver.py】driver-related operations

# -*- coding: utf-8 -*-
# python 3.5
import sys
import configparser
from selenium import webdriver
from lagou import Lagou

sys.path.append(r'E:/mypy/lagou')

cf = configparser.ConfigParser()
cf.read("setting.cfg")

driverPath = cf.get("driver", "driverPath").strip().replace("\'", "").replace(r"\n", "")
imgPath = cf.get("driver", "imgPath").strip().replace("\'", "").replace(r"\n", "")


class MyDriver(object):

    def __init__(self):
        self.imgPath = imgPath
        self.driverPath = driverPath
        #self.driver = webdriver.PhantomJS()
        self.driver = webdriver.Chrome("D:/Python35/selenium/webdriver/chromedriver/chromedriver.exe")
        self.myweb = Lagou(self.driver)

    def setUp(self, url):
        self.driver.get(url)

    # getters/setters for this class
    def setImgPath(self, imgPath):
        self.imgPath = imgPath

    def setDriverPath(self, driverPath):
        self.driverPath = driverPath

    def getImgPath(self):
        return self.imgPath

    def getDriverPath(self):
        return self.driverPath

    def getDriver(self):
        return self.driver

    # driver-level operations
    def setOptions(self):
        self.driver.maximize_window()
        #self.driver.set_window_size(width, height)

    def saveScreenshot(self):
        self.driver.get_screenshot_as_file(imgPath)

    def quitDriver(self):
        self.driver.quit()

    # generic login helpers
    def setUserPwd(self, username, password):
        self.myweb.setUsername(username)
        self.myweb.setPassword(password)

    def doSubmit(self):
        self.myweb.doSubmit()

    def getLoginErrMsg(self):
        return self.myweb.getLoginErrMsg()

    # Lagou-specific operations
    def doFirstSearch(self, keyword):
        self.myweb.firstSearch(keyword)
        self.myweb.firstSearchClick()

    def doDetailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        self.myweb.detailSearch(keyword, city, workyear, education, financestage, industryfield, monthsalary)

    def saveDate(self):
        self.myweb.saveDate()

【lagou.py】Lagou page element lookup and related operations

# -*- coding: utf-8 -*-
# python 3.5
import time
from mssql import MSSQL


class Lagou(object):

    def __init__(self, driver):
        self.mssql = MSSQL()
        self.driver = driver
        #self.taltalpage = 0

    # login
    def setUsername(self, username):
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入常用手机号/邮箱']").send_keys(username)

    def setPassword(self, password):
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys(password)

    def doSubmit(self):
        return self.driver.find_element_by_xpath("//form[@class='active']/div[5]/input[@type='submit']").click()

    def getLoginErrMsg(self):
        return self.driver.find_element_by_class_name('input_tips').text.strip()

    # home-page search (few filters)
    def firstSearch(self, keyword):
        return self.driver.find_element_by_id('search_input').send_keys(keyword)

    def firstSearchClick(self):
        return self.driver.find_element_by_id('search_button').click()

    # detailed search
    def detailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        keyword = keyword.strip()
        if len(city) == 0:
            city = ""
        else:
            city = "&city=%s" % city.strip()
        if len(workyear) == 0:
            workyear = ""
        else:
            workyear = "&gj=%s" % workyear.strip()
        if len(education) == 0:
            education = ""
        else:
            education = "&xl=%s" % education.strip()
        if len(financestage) == 0:
            financestage = ""
        else:
            financestage = "&jd=%s" % financestage.strip()
        if len(industryfield) == 0:
            industryfield = ""
        else:
            industryfield = "&hy=%s" % industryfield.strip()
        if len(monthsalary) == 0:
            monthsalary = ""
        else:
            monthsalary = "&yx=%s" % monthsalary.strip()
        # clicking the filter tags is fiddly, so build the search URL directly
        url = "https://www.lagou.com/jobs/list_%s?px=default" % keyword
        url = url + "%s%s%s%s%s%s" % (workyear, education, financestage, industryfield, monthsalary, city)
        self.driver.get(url)

    # total number of pages
    def getTaltalPage(self):
        num = self.driver.find_element_by_xpath("//div[@class='page-number']/span[2]").text.strip()
        if len(num) == 0:
            num = 0
        #self.taltalpage = int(num)
        print("Total pages: %s" % num)
        return int(num)

    # click "next page"
    def NextPage(self):
        self.driver.find_element_by_xpath("//span[@class='pager_next ']").click()

    # save data from every page
    def saveDate(self):
        taltalpage = self.getTaltalPage()
        currentpage = 1
        if taltalpage != 0:
            while currentpage <= taltalpage:
                time.sleep(3)  # wait for the page to load
                print(">> processing page %s ..." % currentpage)
                print(self.driver.current_url)
                self.saveOnePageDate()  # save the current page
                self.NextPage()         # click "next page"
                currentpage = currentpage + 1

    # save one page of data (15 <li> items per page)
    def saveOnePageDate(self):
        index = 0
        while index <= 14:
            xpath = "//li[@data-index='%s']" % index
            print(">> item %s" % index)
            self.saveliDate(xpath)
            index = index + 1

    # parse one <li> and save it to the database
    def saveliDate(self, xpath):
        positi = self.driver.find_element_by_xpath(xpath + "/div[1]/div[1]/div[1]/a/h3").text.strip()
        citydist = self.driver.find_element_by_xpath(xpath + "/div[1]/div[1]/div[1]/a/span/em").text.strip()
        salary = self.driver.find_element_by_xpath(xpath + "/div[1]/div[1]/div[2]/div/span").text.strip()
        wy_edu = self.driver.find_element_by_xpath(xpath + "/div[1]/div[1]/div[2]/div").text.strip()
        company = self.driver.find_element_by_xpath(xpath + "/div[1]/div[2]/div[1]/a").text.strip()
        fina_ind = self.driver.find_element_by_xpath(xpath + "/div[1]/div[2]/div[2]").text.strip()
        firsttype = self.driver.find_element_by_xpath(xpath + "/div[2]/div[1]").text.strip()
        lables = self.driver.find_element_by_xpath(xpath + "/div[2]/div[2]").text.strip()
        companyfullname = company
        positionname = positi
        salary = salary  #((wy_edu.replace(" ", "/")).split('/')[0]).strip()
        workyear = ((wy_edu.replace(" ", "/")).split('/')[1]).strip()
        education = ((wy_edu.replace(" ", "/")).split('/')[4]).strip()
        city = ((citydist + '·' + citydist).split('·')[0]).strip()
        district = ((citydist + '·' + citydist).split('·')[1]).strip()
        industryfield = (fina_ind.split('/')[0]).strip()
        financestage = (fina_ind.split('/')[1]).strip()
        firsttype = firsttype.replace(" ", ",").strip()
        positionlables = lables.replace("“", "").replace("”", "").strip()
        sql = """INSERT INTO seleniumlagou
                 (companyfullname, positionname, salary, workyear, education, city,
                  district, industryfield, financestage, firsttype, positionlables)
                 VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')""" % \
              (companyfullname, positionname, salary, workyear, education, city,
               district, industryfield, financestage, firsttype, positionlables)
        self.mssql.ExecNonQuery(sql)
        # debug output
        """
        print("companyfullname = " + companyfullname)
        print("positionname = " + positionname)
        print("salary = " + salary)
        print("workyear = " + workyear)
        print("education = " + education)
        print("city = " + city)
        print("district = " + district)
        print("industryfield = " + industryfield)
        print("financestage = " + financestage)
        print("firsttype = " + firsttype)
        print("positionlables = " + positionlables)
        """
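The query-string concatenation in `detailSearch` is pure string work, so it can be factored into a standalone function and checked without a browser. This refactor is a sketch; the parameter-to-key mapping (`gj`, `xl`, `jd`, `hy`, `yx`, `city`) and the append order are copied from the method above:

```python
def build_search_url(keyword, city="", workyear="", education="",
                     financestage="", industryfield="", monthsalary=""):
    """Rebuild the Lagou detailed-search URL the same way detailSearch does."""
    # (query key, value) pairs in the same order detailSearch appends them
    params = [("gj", workyear), ("xl", education), ("jd", financestage),
              ("hy", industryfield), ("yx", monthsalary), ("city", city)]
    url = "https://www.lagou.com/jobs/list_%s?px=default" % keyword.strip()
    for key, value in params:
        if value.strip():                      # skip empty filters
            url += "&%s=%s" % (key, value.strip())
    return url

print(build_search_url("DBA"))
# https://www.lagou.com/jobs/list_DBA?px=default
```

Being a pure function, it is trivial to unit-test, and `detailSearch` would reduce to `self.driver.get(build_search_url(...))`.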

【main.py】entry script

# -*- coding: utf-8 -*-
# python 3.5
import time
import unittest
from mydriver import MyDriver


class Main(unittest.TestCase):

    username = "kk"
    password = "kk"
    loginUrl = 'https://passport.lagou.com/login/login.html'

    # [log in to Lagou]
    mydriver = MyDriver()
    driver = mydriver.getDriver()
    #mydriver.setOptions()
    mydriver.setUp(loginUrl)
    mydriver.setUserPwd(username, password)
    mydriver.doSubmit()
    print("[1] " + driver.current_url)

    # the page has finished redirecting once the URL changes
    while True:
        if loginUrl == driver.current_url:
            time.sleep(1)
            print("[-] " + driver.current_url)
            print("loading......")
            continue
        else:
            break
    print("[2] " + driver.current_url)
    #mydriver.saveScreenshot()

    # [search with filters]
    # the home page offers few filters; any keyword plus "search"
    # jumps straight to the detailed search list
    mydriver.doFirstSearch("hzc")
    print("[3] " + driver.current_url)

    # detailed search, arguments:
    # (position, city, work experience, education, finance stage, industry, monthly salary)
    #mydriver.doDetailSearch("dba", "深圳", "3-5年", "本科", "未融资", "移动互联网", "15k-25k")
    mydriver.doDetailSearch("DBA", "", "", "", "", "", "")
    print("[4] " + driver.current_url)

    mydriver.saveDate()
    print("done!")


if __name__ == "__main__":
    Main()
