静态爬取

https://www.lagou.com/zhaopin/jiqixuexi/?labelWords=label

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/5/6 21:12
# @Author  : Paulson
# @File    : Spider_jingtai.py
# @Software: PyCharm
# @define  : function
import random
import time

import pandas as pd
import requests
from lxml import etree

# Static crawl of the Lagou "machine learning" listing pages: every listing
# page is parsed with XPath, each posting's detail page is then fetched for
# its description, and the combined table is written to a CSV file.

# Real cookie: paste a browser cookie here and uncomment the 'Cookie' header
# entry below if requests need to carry one.
# Cookie = 'xx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    # 'Cookie': Cookie
}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the crawl forever.
REQUEST_TIMEOUT = 10

# One DataFrame per listing page; they are concatenated before writing so
# that widening the page range does not overwrite earlier pages.
page_frames = []

for page in range(1, 2):
    # Random pause between listing pages to keep the request rate low.
    time.sleep(random.randint(3, 10))
    url = 'https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=2'.format(page)
    print(url)
    print('正在抓取第{}页数据...'.format(page))

    # Fetch the listing page and parse it into an element tree.
    con = etree.HTML(requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text)

    # Pull each column out of the listing page with an XPath expression.
    job_name = con.xpath('//a[@class="position_link"]/h3/text()')
    print(job_name)
    job_address = con.xpath("//a[@class='position_link']/span/em/text()")
    job_company = con.xpath("//div[@class='company_name']/a/text()")
    job_salary = con.xpath("//span[@class='money']/text()")
    job_exp_edu = con.xpath("//div[@class='li_b_l']/text()")
    # The text nodes include whitespace-only fragments; keep only real text.
    job_exp_edu2 = [text.strip() for text in job_exp_edu if text.strip()]
    job_industry = con.xpath("//div[@class='industry']/text()")
    job_tempation = con.xpath("//div[@class='list_item_bot']/div[@class='li_b_r']/text()")
    job_links = con.xpath("//div[@class='p_top']/a/@href")
    print(job_links)

    # Follow every detail-page link and collect the description paragraphs.
    job_des = []
    for link in job_links:
        time.sleep(random.randint(3, 10))
        con2 = etree.HTML(requests.get(url=link, headers=headers, timeout=REQUEST_TIMEOUT).text)
        # One entry per posting: the list of its paragraph texts.
        des = [[p.xpath('string(.)') for p in con2.xpath("//dd[@class='job_bt']/div/p")]]
        print(des)
        job_des += des

    # Bundle the columns of this page; pandas requires every column to have
    # the same length and raises ValueError otherwise.
    dataset = {
        '岗位名称': job_name,
        '工作地址': job_address,
        '公司': job_company,
        '薪资': job_salary,
        '经验学历': job_exp_edu2,
        '所属行业': job_industry,
        '岗位福利': job_tempation,
        '任职要求': job_des,
    }
    page_frames.append(pd.DataFrame(dataset))

# Combine all pages into one frame and save it as CSV.
data = pd.concat(page_frames, ignore_index=True)
data.to_csv('machine_learning_hz_job2.csv')

结果

动态爬取

https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/5/6 22:11
# @Author  : Paulson
# @File    : Spider_dongtai.py
# @Software: PyCharm
# @define  : function
import json
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Main crawl function
def lagou_dynamic_crawl():
    """Crawl Lagou's AJAX search endpoint for '机器学习' postings.

    For each of pages 1-30 the search listing page is requested first so the
    session picks up cookies, then the ``positionAjax.json`` endpoint is
    POSTed with the page number. Every posting in the response is flattened
    into a dict and enriched with the description text returned by
    ``recruit_detail``.

    Returns:
        list[dict]: one dict per posting, with the keys ``position_name``,
        ``work_year``, ``education``, ``salary``, ``city``, ``company_name``,
        ``address``, ``label``, ``stage``, ``size``, ``advantage``,
        ``industry``, ``industryLables`` and ``position_detail``.

    Raises:
        AssertionError: the server answered with its "too frequent" message.
            The exception type matches the original ``assert False`` but is
            raised explicitly so it still fires under ``python -O``.
        RuntimeError: the response carries no position list for any other
            reason.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    # Listing page: requested only to obtain cookies before the AJAX call.
    list_url = 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    # JSON endpoint that returns the postings themselves.
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # (output key, key in the JSON record) pairs, in output order.
    field_map = (
        ('position_name', 'positionName'),
        ('work_year', 'workYear'),
        ('education', 'education'),
        ('salary', 'salary'),
        ('city', 'city'),
        ('company_name', 'companyFullName'),
        ('address', 'businessZones'),
        ('label', 'companyLabelList'),
        ('stage', 'financeStage'),
        ('size', 'companySize'),
        ('advantage', 'positionAdvantage'),
        ('industry', 'industryField'),
        ('industryLables', 'industryLables'),
    )

    # Container for every posting collected across all pages.
    positions = []
    for page in range(1, 31):
        print('正在抓取{}页数据...'.format(page))
        # Form parameters of the search request.
        params = {
            'first': 'false',
            'pn': page,
            'kd': '机器学习'
        }
        # A fresh session per page: the listing page is fetched for new
        # cookies before each AJAX call. The context manager closes the
        # session's connections once the page has been fetched.
        with requests.Session() as s:
            s.get(list_url, headers=headers, timeout=3)
            cookie = s.cookies
            result = s.post(ajax_url, headers=headers, data=params, cookies=cookie, timeout=3)
        print(result.text)
        json_result = result.json()

        # Extract the position list; a missing key means an error payload.
        try:
            position_info = json_result['content']['positionResult']['result']
        except (KeyError, TypeError):
            if isinstance(json_result, dict):
                message = str(json_result.get('msg', ''))
            else:
                message = str(json_result)
            if '您操作太频繁' in message:
                print('操作太频繁')
                raise AssertionError('操作太频繁') from None
            # Any other error payload: stop here instead of iterating over
            # the characters of the message string.
            raise RuntimeError(
                'unexpected response for page {}: {!r}'.format(page, message)
            ) from None

        # Flatten every posting on this page and fetch its detail page.
        for position in position_info:
            position_dict = {out_key: position[src_key] for out_key, src_key in field_map}
            # The posting id selects the detail page holding the description.
            position_id = position['positionId']
            position_dict['position_detail'] = recruit_detail(position_id)
            positions.append(position_dict)

        # Random pause before requesting the next page.
        time.sleep(random.randint(3, 6))

    print('全部数据采集完毕...')
    return positions


# Helper that fetches a posting's job description
def recruit_detail(position_id):
    """Fetch the description text of one posting from its detail page.

    Args:
        position_id: id of the posting; interpolated into the detail-page
            URL ``https://www.lagou.com/jobs/<position_id>.html``.

    Returns:
        str: text of the element with class ``job_bt``, or the literal
        string ``'null'`` when the page has no such element.
    """
    # NOTE(review): 'br' is advertised in Accept-Encoding; confirm the
    # runtime can decode Brotli responses, otherwise the page text may be
    # unreadable and every description would come back as 'null'.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?labelWords=&fromSearch=true&suginput=",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    # Same 3-second timeout as the listing requests in lagou_dynamic_crawl,
    # so a stalled detail page cannot hang the crawl.
    result = requests.get(url, headers=headers, timeout=3)
    # Random pause after each detail request.
    time.sleep(random.randint(1, 4))

    # Parse the page and locate the job-description container.
    soup = BeautifulSoup(result.text, 'html.parser')
    job_bt = soup.find(class_='job_bt')
    # Some postings have no description element; report those as 'null'.
    if job_bt is not None:
        return job_bt.text
    return 'null'


if __name__ == '__main__':
    # Run the full crawl and store the postings as CSV.
    positions = lagou_dynamic_crawl()
    data = pd.DataFrame(positions)
    data.to_csv('machine_learning_hz_job3.csv')

结果

感谢star
点击前往GitHub项目
点击前往 jupyter notebook 文件

【2019.05】python 爬取拉钩数据（静态+动态）相关推荐

python 爬取拉钩数据
Python通过Request库爬取拉钩数据：爬取方法、数据页面、建表存储职位信息、解析页面、核心代码、完整代码、结果展示。爬取方法：采用python爬取拉钩数据,有很多方法可以爬取,我采用的是通过Re ...
python 爬取拉钩网数据
python 爬取拉钩网数据完整代码下载:https://github.com/tanjunchen/SpiderProject/blob/master/lagou/LaGouSpider.py # ...
python 爬取拉钩招聘数据
上一篇介绍在linux 搭建jupter lab,本文将介绍python数据接口的爬取以及提取建模分析的数据导入依赖的包 import requests import time from urlli ...
python爬取拉钩python数据分析职位招聘信息
python数据分析 python数据分析是目前python最火的方向之一,为了解目前市场对该职位的需求,我们爬取了拉钩上对python数据分析的招聘信息. 环境系统:windows7 pytho ...
用 Python 爬取各类基金数据并动态展示
去年接触基金,体会到了基金的香(真香),这几天也是过年后开始交易的日子,今天爬取『蛋卷基金』数据,通过pyecharts动图可视化方式展示基金的涨跌情况. 本文将围绕这三点去进行爬取数据,动图可视化展 ...
Python应用实战-Python爬取4000+股票数据，并用plotly绘制了树状热力图(treemap)
目录: 1. 准备工作 2. 开始绘图 2.1. 简单的例子 2.2. px.treemap常用参数介绍 2.3. color_continuous_scale参数介绍 2.4. 大A股市树状热力图来 ...
Python 爬取拉勾招聘信息
Python 爬取拉勾招聘信息故事背景最近有个好哥们啊浪迫于家里工资太低,准备从北方老家那边来深圳这边找工作,啊浪是学平面设计的知道我在深圳这边于是向我打听深圳这边平面设计薪资水平,当时我有点懵逼 ...
python爬取朋友圈_利用Python爬取朋友圈数据，爬到你开始怀疑人生
人生最难的事是自我认知,用Python爬取朋友圈数据,让我们重新审视自己,审视我们周围的圈子. 文:朱元禄(@数据分析-jacky) 哲学的两大问题:1.我是谁?2.我们从哪里来? 本文 jacky试 ...
利用Python爬取国家水稻数据中心的品种数据
利用Python爬取国家水稻数据中心的品种数据一.页面获取 python可以进行对网页的访问,主要用到requests,beautifulsoup4包. 首先新建一个page的py文件,用来获取页面 ...

【2019.05】python 爬取拉钩数据（静态+动态）

静态爬取

结果

动态爬取

结果

【2019.05】python 爬取拉钩数据（静态+动态）相关推荐

最新文章

热门文章