1、导入包

import requests   #取数
from lxml import etree   #用xpath解析
import pymysql   #连接数据库
import chardet   #自动获取编码

2、获取单页html

def get_one_page(url, timeout=10):
    """Download one page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        The response body, decoded with the encoding that ``chardet``
        reports for the raw bytes.
    """
    # Send a browser-like User-Agent header along with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Detect the encoding from the raw bytes instead of hard-coding it
    # (it could also be looked up by hand in the page source).
    response.encoding = chardet.detect(response.content)['encoding']
    return response.text

3、解析html

def _parse_salary(raw):
    """Convert one raw salary string into a ``(low, high)`` pair of numbers.

    Recognised forms are ranges such as ``"1-1.5万/月"``, ``"10-20万/年"``
    (converted to a monthly figure, rounded to one decimal) and
    ``"6-8千/月"``, plus single daily wages such as ``"150元/天"`` (kept as
    the daily amount, with low == high). Anything else -- empty text, text
    too short to carry a unit, unknown units, or non-numeric amounts --
    yields ``(0, 0)`` instead of raising.
    """
    text = raw.strip()
    if not text:
        return 0, 0

    parts = text.split('-')
    is_range = len(parts) > 1
    # The unit suffix sits on the second part of a range, or on the only part.
    amount = parts[1] if is_range else parts[0]
    if len(amount) < 3:
        # Too short to hold "<unit>/<period>"; indexing [-3] would fail.
        return 0, 0
    unit, period, number = amount[-3], amount[-1], amount[:-3]

    try:
        if is_range:
            low, high = float(parts[0]), float(number)
            if unit == '万' and period == '月':
                return low * 10000, high * 10000
            if unit == '万' and period == '年':
                # Monthly salary = yearly salary / 12, one decimal place.
                return round(low * 10000 / 12, 1), round(high * 10000 / 12, 1)
            if unit == '千' and period == '月':
                return low * 1000, high * 1000
        elif unit == '元' and period == '天':
            # A single value: lowest and highest salary are the same.
            daily = float(number)
            return daily, daily
    except ValueError:
        # The numeric part was not a number after all.
        return 0, 0
    return 0, 0


def parse_one_page(html, year='2019'):
    """Extract and clean the job listings contained in one result page.

    Args:
        html: HTML text of a search-result page.
        year: Year prepended to every publish date (the page values get a
            ``"<year>-"`` prefix). Defaults to ``'2019'``.

    Yields:
        A single dict of parallel lists: ``t1`` (job title), ``t2``
        (company), ``t3`` (work place), ``t4`` (raw salary text), ``t5``
        (publish date), ``href`` (detail link), ``sal_low`` and
        ``sal_height`` (lowest / highest salary as numbers).
    """
    # Parse the document once, then query it with XPath.
    result = etree.HTML(html)
    item = {}  # one dict holding every field of every job on the page

    item['t1'] = result.xpath('//div[@class="el"]/p/span/a/text()')  # job title
    item['t2'] = result.xpath('//div[@class="el"]/span[@class="t2"]/a/text()')  # company
    item['t3'] = result.xpath('//div[@class="el"]/span[@class="t3"]/text()')  # work place
    # text() returns nothing for an empty node and the salary may be empty,
    # so select the nodes and take string(.) of each one instead
    # (the dot means "the current node").
    salary_nodes = result.xpath('//div[@class="el"]/span[@class="t4"]')
    item['t4'] = [node.xpath('string(.)') for node in salary_nodes]
    item['t5'] = result.xpath('//div[@class="el"]/span[@class="t5"]/text()')  # publish date
    item['href'] = result.xpath('//div[@class="el"]/p/span/a/@href')  # detail link

    # Step 4: data cleaning. The values extracted above are stored in the
    # dict; the cleaning below is still part of parse_one_page.

    # (1) Strip the whitespace around every job title.
    item['t1'] = [title.strip() for title in item['t1']]

    # (2) Salary: turn each raw string into lowest / highest numbers.
    salary_bounds = [_parse_salary(text) for text in item['t4']]
    item['sal_low'] = [low for low, _ in salary_bounds]
    item['sal_height'] = [high for _, high in salary_bounds]

    # (3) Publish date: prefix every value with the year.
    item['t5'] = [str(year) + '-' + posted for posted in item['t5']]

    yield item

5、存储至mysql

def write_to_mysql(content):
    """Insert every job held in ``content`` into the ``wuyoujob`` table.

    Args:
        content: Dict of parallel lists as produced by ``parse_one_page``;
            the keys ``t1``, ``t2``, ``t3``, ``sal_low``, ``sal_height``,
            ``t5`` and ``href`` are read.

    Raises:
        IndexError: If a list is shorter than ``content['t1']``. The rows
            are assembled before anything is written, so nothing is
            inserted in that case.

    All rows of one call are written in a single transaction: they are
    committed together, or rolled back together if an insert fails. The
    connection is closed in every case.
    """
    # Only these seven fields are stored.
    rows = [
        (
            content['t1'][i],          # job name
            content['t2'][i],          # company
            content['t3'][i],          # work place
            content['sal_low'][i],     # lowest salary
            content['sal_height'][i],  # highest salary
            content['t5'][i],          # publish time
            content['href'][i],        # detail link
        )
        for i in range(len(content['t1']))
    ]
    if not rows:
        return

    # The table can be created beforehand (e.g. in Navicat) with one extra
    # auto-increment primary-key column `id`, filled by the leading null.
    sql = "insert into wuyoujob values(null,%s,%s,%s,%s,%s,%s,%s)"

    # Open the connection.
    conn = pymysql.connect(host='localhost', user='root', password='vicky',
                           database='test_db', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        conn.commit()
    except Exception:
        # Leave no half-written page behind, then let the caller see the error.
        conn.rollback()
        raise
    finally:
        conn.close()

5、函数回调

函数都写好了，接下来在主函数里依次调用即可。

def main(page):
    """Fetch, parse, print and store one page of search results.

    Args:
        page: Page number of the result list; it is converted to a string
            and spliced into the search URL.
    """
    # Everything after the "?" of the original address can be dropped; only
    # the part in front of it is needed. The page number has to be turned
    # into a string before it can be joined into the address.
    url_prefix = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,'
    url_suffix = '.html?'
    url = ''.join([url_prefix, str(page), url_suffix])

    page_html = get_one_page(url)
    for record in parse_one_page(page_html):
        print(record)           # show the cleaned dict (optional)
        write_to_mysql(record)  # hand the dict over to the database

6、回调主函数，完成分页

if __name__ == '__main__':
    # Crawl result pages 1 through 8; adjust the upper bound to roughly the
    # number of pages the search actually returns.
    for page_number in range(1, 9):
        main(page_number)

然后打开Navicat，刷新一下表，见证奇迹的时候到了！
对，没错，我取的是杭州的数据

Python3爬取前程无忧数据分析工作并存储到MySQL相关推荐

python-爬虫，实现输入关键字，然后爬取关键字主页代码并存储到mysql数据库
python-爬虫,实现输入关键字,然后爬取关键字主页代码并存储到mysql数据库实现代码如下: 代码是可以实现的,有问题可以私聊我 import os import sys import base ...
Python3爬取前程无忧招聘数据教程
文章来自群友易某某的投稿,在此表示感谢! 原文链接:https://blog.csdn.net/weixin_42572590/article/details/103443213 前几天,我发表了 ...
爬虫实战入门级教学（数据爬取-＞数据分析-＞数据存储）
爬虫实战入门级教学(数据爬取->数据分析->数据存储) 天天刷题好累哦,来一期简单舒适的爬虫学习,小试牛刀(仅供学习交流,不足之处还请指正) 文章讲的比较细比较啰嗦,适合未接触过爬虫的新手 ...
用Python Scrapy爬取某电影网站并存储入mysql
爬取目标:javlib,使用框架Scrapy 首先使用在命令行里 scrapy startproject projectname 和 scrapy genspider spidername 指令创建爬 ...
Python爬取热门微博，并存储到MySQL中
目标网站:m.weibo.cn url的获取可以从浏览器的F12中的network的XHR中找到. weibo_demo.py: import requests import json from w3 ...
通过爬取前程无忧网站数据分析上海互联网行业招聘状况
1.项目要求内容完整程度.可用性(可操作.易操作.美观).时间先后.先进性等. 2.项目内容爬取前程无忧网站(网址:https://www.51job.com/)上的工作招聘信息(截止2018年1 ...
Python3爬取网页信息乱码怎么解决？（更新：已解决）
更新:乱码问题已经解决了. 将下面代码中的红色部分改为下面这样就不会出现个别职位信息乱码的情况了. soup2 = BeautifulSoup(wbdata2, 'html.parser',from_ ...
Python爬虫《自动化学报》数据爬取与数据分析
Python爬虫<自动化学报>数据爬取与数据分析文章目录 Python爬虫<自动化学报>数据爬取与数据分析前言一.代码二.结果展示三.爬虫实现 1.准备 2.获取网页 ...
手把手Selenium安装使用及实战爬取前程无忧招聘网站（一）
目录一.安装浏览器驱动器 1. 下载驱动器 2. 启动驱动器二 .selenium的使用 1. 启动驱动器加载网页 2. 八大元素定位 (1)id 定位 (2)name定位 (3)link_tex ...

Python3爬取前程无忧数据分析工作并存储到MySQL

1、导入包

2、获取单页html

3、解析html

4、数据清洗

5、存储至mysql

5、函数回调

6、回调主函数，完成分页

Python3爬取前程无忧数据分析工作并存储到MySQL相关推荐

最新文章

热门文章