20200318_抓取51job招聘数据存数据库

import datetime
import re

import chardet
import numpy as np
import pandas as pd
import requests
from lxml import etree

# Column accumulators filled from the search-result listing pages.
zwmc, gsmc, gzdd = [], [], []  # job title, company name, work location
xz_low, xz_height = [], []  # lowest / highest monthly salary of the posting
ptime, href = [], []  # publish time, posting URL

# Column accumulators filled from each posting's detail page.
a, b, c = [], [], []  # region, experience, education
d, e = [], []  # number of openings, time
# url='https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html'
# Search-result URL template; "{0}" is replaced by the page number.
LISTING_URL = 'https://search.51job.com/list/070200,000000,0000,00,9,99,%2B,2,{0}.html'
LISTING_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# Seconds to wait for the server; without a timeout requests can block forever.
LISTING_TIMEOUT = 10

# "<low>-<high><unit>/<period>", e.g. "1-1.5万/月", "10-20万/年", "6-8千/月".
_SALARY_RANGE = re.compile(r'(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)([万千])/([月年])')
# "<amount>元/天", e.g. "150元/天".
_DAILY_WAGE = re.compile(r'(\d+(?:\.\d+)?)元/天')


def parse_salary(text):
    """Convert a listing salary string into a ``(low, high)`` numeric pair.

    Recognised formats and their conversion:

    * ``"a-b万/月"``  -> ``(a * 10000, b * 10000)``
    * ``"a-b万/年"``  -> yearly amount divided by 12, rounded to 1 decimal
    * ``"a-b千/月"``  -> ``(a * 1000, b * 1000)``
    * ``"n元/天"``    -> ``(n, n)``; the daily amount is stored as-is (now as
      a float instead of a string) and is NOT converted to a monthly figure

    Anything else, including blank text, yields ``(0, 0)`` instead of raising.
    """
    text = text.strip()

    ranged = _SALARY_RANGE.fullmatch(text)
    if ranged:
        low, high = float(ranged.group(1)), float(ranged.group(2))
        unit, period = ranged.group(3), ranged.group(4)
        if unit == '万' and period == '月':
            return low * 10000, high * 10000
        if unit == '万' and period == '年':
            return round(low * 10000 / 12, 1), round(high * 10000 / 12, 1)
        if unit == '千' and period == '月':
            return low * 1000, high * 1000
        # Other unit/period combinations were never converted; keep that.
        return 0, 0

    daily = _DAILY_WAGE.fullmatch(text)
    if daily:
        wage = float(daily.group(1))
        return wage, wage

    return 0, 0


def qualify_publish_date(month_day, today=None):
    """Prefix a ``"MM-DD"`` publish date with a year.

    The year is taken from *today* (default: the current date) rather than
    being hard-coded. If the month/day would lie after *today*, the previous
    year is used, so a December posting scraped in January is dated correctly.
    Text that is not ``"MM-DD"`` is prefixed with the current year unchanged.
    """
    today = today or datetime.date.today()
    year = today.year
    try:
        month, day = (int(part) for part in month_day.strip().split('-'))
    except ValueError:
        pass  # unexpected format: fall back to the current year
    else:
        if (month, day) > (today.month, today.day):
            year -= 1
    return '{0}-{1}'.format(year, month_day)


def fetch_listing_page(page):
    """Download listing page *page* and return it as a parsed lxml tree."""
    response = requests.get(
        LISTING_URL.format(page), headers=LISTING_HEADERS, timeout=LISTING_TIMEOUT
    )
    # Let chardet pick the encoding from the raw bytes before decoding.
    response.encoding = chardet.detect(response.content)['encoding']
    return etree.HTML(response.text)


def parse_listing_rows(tree):
    """Return one dict per job row found in a listing page tree.

    Fields are read relative to each row, so a row that lacks one field gets
    an empty string for it instead of shifting every later row's values.
    """
    jobs = []
    if tree is None:  # etree.HTML returns None for an empty document
        return jobs
    for row in tree.xpath('//div[@class="el"][p/span/a/@href]'):
        link = row.xpath('p/span/a')[0]
        low, high = parse_salary(row.xpath('string(span[@class="t4"])'))
        jobs.append({
            'title': ''.join(link.xpath('text()')).strip(),
            'company': ''.join(row.xpath('span[@class="t2"]/a/text()')),
            'location': ''.join(row.xpath('span[@class="t3"]/text()')),
            'salary_low': low,
            'salary_high': high,
            'published': qualify_publish_date(
                ''.join(row.xpath('span[@class="t5"]/text()'))
            ),
            'href': link.get('href'),
        })
    return jobs


if __name__ == '__main__':
    # NOTE(review): page numbers 0 and 1 are requested, exactly as before;
    # confirm whether the site's numbering starts at 1 (page 0 may duplicate it).
    for page in range(2):
        for job in parse_listing_rows(fetch_listing_page(page)):
            zwmc.append(job['title'])
            gsmc.append(job['company'])
            gzdd.append(job['location'])
            xz_low.append(job['salary_low'])
            xz_height.append(job['salary_high'])
            ptime.append(job['published'])
            href.append(job['href'])

j = 0  # not read anywhere in the visible script; kept in case later cells use it

DETAIL_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# Seconds to wait for the server; without a timeout requests can block forever.
DETAIL_TIMEOUT = 10
# Text fragments of the summary line in the posting header.
DETAIL_XPATH = '/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()'


def split_detail_fields(fields):
    """Map the summary-line fragments to a fixed 5-tuple.

    Returns ``(region, experience, education, openings, time)``:

    * 5 fragments: used in order.
    * 4 fragments: experience is NaN and the remaining fragments fill
      region, education, openings, time.
      NOTE(review): this keeps the original mapping, which assumes the
      missing fragment is always "experience" -- confirm against real pages.
    * any other count: all five values are NaN.

    Always returning five values keeps every detail column the same length.
    """
    if len(fields) == 5:
        return tuple(fields)
    if len(fields) == 4:
        region, education, openings, posted = fields
        return region, np.nan, education, openings, posted
    return (np.nan,) * 5


def fetch_detail_fields(url):
    """Download a posting page and return the summary-line text fragments."""
    response = requests.get(url, headers=DETAIL_HEADERS, timeout=DETAIL_TIMEOUT)
    # Let chardet pick the encoding from the raw bytes before decoding.
    response.encoding = chardet.detect(response.content)['encoding']
    tree = etree.HTML(response.text)
    if tree is None:  # etree.HTML returns None for an empty document
        return []
    return tree.xpath(DETAIL_XPATH)


if __name__ == '__main__':
    for detail_url in href:
        print(detail_url)
        try:
            fragments = fetch_detail_fields(detail_url)
        except requests.RequestException as exc:
            # One unreachable posting should not discard everything scraped
            # so far; record it as a row of NaN, like an unparseable page.
            print('failed to fetch {0}: {1}'.format(detail_url, exc))
            fragments = []
        region, experience, education, openings, posted = split_detail_fields(fragments)
        a.append(region)
        b.append(experience)
        c.append(education)
        d.append(openings)
        e.append(posted)

https://jobs.51job.com/nanjing/120765370.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/111504132.html?s=01&t=0
https://jobs.51job.com/nanjing/119195851.html?s=01&t=0
https://jobs.51job.com/fuzhou/120763909.html?s=01&t=0
https://jobs.51job.com/nanjing/119679799.html?s=01&t=0
https://jobs.51job.com/nanjing/120759263.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120758430.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/117194442.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/116958794.html?s=01&t=0
https://jobs.51job.com/nanjing/120749829.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/120747304.html?s=01&t=0
https://jobs.51job.com/nanjing/120748516.html?s=01&t=0
https://jobs.51job.com/nanjing/120747138.html?s=01&t=0
https://jobs.51job.com/nanjing/120746319.html?s=01&t=0
http://astrazeneca.51job.com/sc/show_job_detail.php?jobid=120745775
https://jobs.51job.com/nanjing-jyq/120116860.html?s=01&t=0
https://jobs.51job.com/nanjing-xwq/119839700.html?s=01&t=0
https://jobs.51job.com/nanjing-yhtq/120735484.html?s=01&t=0
https://jobs.51job.com/nanjing/120734422.html?s=01&t=0
https://jobs.51job.com/nanjing/117008657.html?s=01&t=0
https://jobs.51job.com/nanjing/119619851.html?s=01&t=0
https://jobs.51job.com/nanjing/120763081.html?s=01&t=0
https://jobs.51job.com/nanjing/94954959.html?s=01&t=0
https://jobs.51job.com/nanjing-qhq/105410257.html?s=01&t=0
https://jobs.51job.com/nanjing/120751843.html?s=01&t=0
https://jobs.51job.com/chuzhou/120750509.html?s=01&t=0
http://schaeffler.51job.com/sc/show_job_detail.php?jobid=111620488
https://jobs.51job.com/nanjing/114661488.html?s=01&t=0
http://deppon.51job.com/sc/show_job_detail.php?jobid=119032948
https://jobs.51job.com/nanjing/120759749.html?s=01&t=0
https://jobs.51job.com/nanjing/120335790.html?s=01&t=0
https://jobs.51job.com/nanjing/120077889.html?s=01&t=0
https://jobs.51job.com/nanjing-yhtq/120736428.html?s=01&t=0
https://jobs.51job.com/nanjing/118627153.html?s=01&t=0
https://jobs.51job.com/chuzhou/120741653.html?s=01&t=0
https://jobs.51job.com/nanjing-xwq/120741607.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/107120958.html?s=01&t=0
https://jobs.51job.com/nanjing/120738083.html?s=01&t=0
https://jobs.51job.com/nanjing/108133718.html?s=01&t=0
https://jobs.51job.com/nanjing/119959950.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/117650408.html?s=01&t=0
https://jobs.51job.com/nanjing-glq/104704234.html?s=01&t=0
https://jobs.51job.com/nanjing/114939099.html?s=01&t=0
https://jobs.51job.com/nanjing-lsq/120621444.html?s=01&t=0
https://jobs.51job.com/nanjing-lsq/120432024.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/104835579.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120766217.html?s=01&t=0
https://jobs.51job.com/nanjing/120766170.html?s=01&t=0
https://jobs.51job.com/nanjing-qxq/120750779.html?s=01&t=0
https://jobs.51job.com/nanjing-jnq/120517790.html?s=01&t=0

# Output column name -> list of collected values for that column.
data = {
    '职位名称': zwmc,        # job title
    '公司名称': gsmc,        # company name
    '工作地点': gzdd,        # work location
    '职位最低月薪': xz_low,    # lowest monthly salary
    '职位最高月薪': xz_height,  # highest monthly salary
    '发布时间': ptime,       # publish time
    '网站地址': href,        # posting URL
    '地区': a,             # region
    '经验': b,             # experience
    '学历': c,             # education
    '招聘人数': d,          # number of openings
}

def dic2sql(dic, sql):
    """Render *dic* as a SQL ``VALUES`` list and substitute it into *sql*.

    Each ``key: value`` pair becomes the Python ``str()`` of the tuple
    ``(key, value)``; the pairs are joined with commas and inserted into
    *sql* through its single ``%s`` placeholder.

    Args:
        dic: mapping whose items become the value rows.
        sql: statement template containing exactly one ``%s``.

    Returns:
        The completed SQL statement as a string.

    NOTE: values are formatted with Python's repr rules, not escaped by the
    database driver, so this must not be used with untrusted input; prefer
    ``cursor.executemany`` with driver-side parameters for real data.
    """
    rows = ','.join(str((key, dic[key])) for key in dic)
    return sql % rows


if __name__ == '__main__':
    # Imported here so dic2sql stays usable without a MySQL driver installed.
    import MySQLdb

    dic = {'apple': 216, 'jar': 138}
    sql = "insert into users (login,userid) VALUES %s;"
    ret = dic2sql(dic, sql)
    # print(ret)

    # Connect to MySQL and submit the statement; the connection is closed
    # even when execute/commit raises.
    cxn = MySQLdb.connect(user='root', password='password', db='test')
    try:
        cur = cxn.cursor()
        cur.execute(ret)
        cxn.commit()
    finally:
        cxn.close()

from sqlalchemy import create_engine

# Assemble the collected columns into one table. pandas raises ValueError
# here if the column lists do not all have the same length.
test = pd.DataFrame(data)

# NOTE(review): the database credentials are hard-coded in the URL; consider
# reading them from the environment or a config file.
engine = create_engine("mysql+pymysql://root:123456@localhost:3306/spider?charset=utf8")
try:
    # Append the rows to table "cnblog" without writing the DataFrame index.
    # (index_label is only meaningful when index=True, so it is not passed.)
    test.to_sql(name='cnblog', con=engine, if_exists='append', index=False)
finally:
    # Release pooled connections whether or not the insert succeeded.
    engine.dispose()

# Also save a spreadsheet copy in the working directory.
test.to_excel('test.xlsx')

20200318_抓取51job招聘数据存数据库相关推荐

python3 scrapy实战：爬取拉勾网招聘数据至数据库（反爬虫）
首先注明:感谢拉勾网提供的权威.质量的数据,本人抱着学习的态度,不愿增加其服务器负担,与dos攻击. 由于后面准备做一个大一点的数据分析项目,所以前提需要获取大量的有质量和权威的信息,其中一个获取点便 ...
爬取51job招聘数据相关招聘信息
表格第一行第一行第一列职位名公司名工作地点&薪资&发布时间二级网址经验&学历信息职位信息公司类型&公司规模(人数)&所属行业(公司)
使用Python爬取51job招聘网的数据
使用Python爬取51job招聘网的数据进行网站分析获取职位信息存储信息最终代码进行网站分析进入https://www.51job.com/这个网站我在这就以python为例搜索职位跳 ...
抓取前程无忧招聘信息
抓取前程无忧招聘信息本文通过分析前程无忧的相关规则,通过python来抓取相关的招聘信息,并通过redis缓存相关信息,实现增量抓取. 相关技术 python3.6 requests redis m ...
[Python] scrapy + selenium 抓取51job 职位信息（实现传参控制抓取页数+职位名称+城市）
目录一.目标二.51job网页分析: 1.网页构成观察 2.网页分析三.代码实现 1. 踩过的坑-----实现城市选择 2.代码实现 3.代码优化 1)存放格式优化 2)在爬虫中去掉\xa0\x ...
Fiddler 详尽教程与抓取移动端数据包
转载自:http://blog.csdn.net/qq_21445563/article/details/51017605 阅读目录 1. Fiddler 抓包简介 1). 字段说明 2). Stat ...
python获取app信息的库_Python学习教程：另辟蹊径，appium抓取app应用数据了解一下...
作为爬虫工程师,没有几个不知道selenium的. 什么是selenium? Selenium原本是一个用于Web应用程序的自动化测试工具.Selenium测试直接运行在浏览器中,就像真正的用户在操作一 ...
appium python 抓包_Python学习教程：另辟蹊径，appium抓取app应用数据了解一下
原标题:Python学习教程:另辟蹊径,appium抓取app应用数据了解一下. 作为爬虫工程师,没有几个不知道selenium的. 什么是selenium? Selenium原本是一个用于Web应用程 ...
Python爬虫：抓取多级页面数据
前面讲解的爬虫案例都是单级页面数据抓取,但有些时候,只抓取一个单级页面是无法完成数据提取的.本节讲解如何使用爬虫抓取多级页面的数据. 在爬虫的过程中,多级页面抓取是经常遇见的.下面以抓取二级页面为例, ...
python抓取京东联盟优惠券_[爬虫]使用python抓取京东全站数据（商品，店铺，分类，评论）...
网上抓取京东数据的文章,现在要么无法抓取数据,要么只能抓取部分数据,本文将介绍如何抓取京东全站数据,包括商品信息.店铺信息,评论信息,分类信息等. -------------------------- ...

20200318_抓取51job招聘数据存数据库

20200318_抓取51job招聘数据存数据库相关推荐

最新文章

热门文章