Complete 51job scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# extract job-posting info from one listing entry
def jobMesssage(item):
    df = pd.DataFrame()
    job_links = item.find_all('a', attrs={'class': 'el'})  # one <a class="el"> per posting
    for i, link in enumerate(job_links):
        try:
            # the trailing commas wrap each value in a 1-tuple, which pandas
            # broadcasts into a one-row column
            df['招聘职位网址'] = link.get('href'),
            df['岗位名称'] = link.find_all('span')[0].text,
            df['发布日期'] = link.find_all('span')[1].text,
            df['薪资'] = link.find_all('span')[2].text,
            df['工作地及要求'] = link.find_all('span')[3].text,
            for tag in link.find_all('p', attrs={'class': 'tags'}):
                df['福利'] = tag.get('title'),
            print(str(i), 'job posting parsed OK')
        except Exception:
            print(str(i), 'job posting parse failed')  # the original printed "正常" in both branches
    return df


# extract info about the company behind each posting
def jobFirm(item):
    df = pd.DataFrame()
    firm_blocks = item.find_all('div', attrs={'class': 'er'})  # one block per company
    for i, firm in enumerate(firm_blocks):
        try:
            df['招聘公司网址'] = firm.find('a').get('href'),
            df['公司名称'] = firm.find('a').text,
            df['公司规模'] = firm.find_all('p')[0].text,
            df['所属行业'] = firm.find_all('p')[1].text,  # the original read [0] twice; [1] is assumed to be the industry row
            print(str(i), 'company info parsed OK')
        except Exception:
            print(str(i), 'company info parse failed')
    return df


# job requirements
def jobRequire(html):
    df = pd.DataFrame()
    main_blocks = html.find_all('div', attrs={'class': 'tCompany_main'})
    for i, block in enumerate(main_blocks):
        try:
            # fallback: the whole first tBorderTop_box flattened into one string
            jobRequir_a = (block.find('div', attrs={'class': 'tBorderTop_box'}).text
                           .strip().replace('\n', '').replace('\t', '').replace(' ', ''))
            paragraphs = block.find('div', attrs={'class': 'tBorderTop_box'}).find_all('p')
            jobRequir = [p.text.strip() + '\n' for p in paragraphs]  # one line per <p>
            jobRequirText = ''.join(jobRequir)
            if jobRequirText.find('任职要求') > 0:
                df['招聘要求'] = jobRequirText,
            else:
                df['招聘要求'] = jobRequir_a,
            print(str(i), 'job requirements parsed OK')
        except Exception:
            print(str(i), 'job requirements parse failed')
    return df


# company profile extraction
def firmMeessage(html):
    df = pd.DataFrame()
    full_blocks = html.find_all('div', attrs={'class': 'tCompany_full'})
    for i, block in enumerate(full_blocks):
        boxes = block.find_all('div', attrs={'class': 'tBorderTop_box'})
        try:
            df['公司信息'] = (boxes[0].text.strip().replace('\n', '')
                          .replace('\t', '').replace(' ', '')),
            print(str(i), 'company profile parsed OK')
        except Exception:
            print(str(i), 'company profile parse failed')
    return df
class writeExcel:
    def __init__(self, data):
        self.data = data

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        new_worksheet.range('l:l').row_height = 20
        new_worksheet.range('l:l').column_width = 11
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业",
                 "招聘职位网址", "招聘要求", "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围',
                 '标记', '顺序', '记录日期', '是否投递']
        new_worksheet['A1'].value = title
        for i in range(len(self.data)):
            new_worksheet.cells[i + 1, 0].value = i + 1
            new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
            new_worksheet.cells[i + 1, 2].value = self.data[i]['发布日期']
            new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
            new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
            new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
            new_worksheet.cells[i + 1, 6].value = self.data[i]['公司规模']
            new_worksheet.cells[i + 1, 7].value = self.data[i]['所属行业']
            new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
            new_worksheet.cells[i + 1, 9].value = self.data[i]['招聘要求']
            new_worksheet.cells[i + 1, 10].value = self.data[i]['招聘公司网址']
            new_worksheet.cells[i + 1, 11].value = self.data[i]['公司信息']
            new_worksheet.cells[i + 1, 12].value = self.data[i]['福利']
            # fields filled from the run parameters
            new_worksheet.cells[i + 1, 13].value = key  # search keyword
            new_worksheet.cells[i + 1, 14].value = '15-40K' if salary == '08%252c09%252c10' else '20-30K'  # salary band
            new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date (original comment said 薪资范围)
            print(str(i), 'row written to Excel')
        new_worksheet.autofit()
        new_workbook.save('jobGain.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # NOTE: the original used multiprocessing.Process(target=self.wE_r()) — the
        # parentheses call wE_r in the parent process and hand Process a None target,
        # so the child did nothing. A direct call keeps the actual behaviour.
        self.wE_r()
class Web:
    def __init__(self, url):
        self.url = url

    # fetch the search-result list page
    def web(self):
        driver.back()
        time.sleep(0.3)
        driver.get(self.url)
        time.sleep(2)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'j_joblist'})

    # fetch a job-detail page (detours via Baidu first to look less bot-like)
    def web_a(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)
        time.sleep(0.7)
        driver.get(url)
        time.sleep(2.2)
        source = driver.page_source
        return BeautifulSoup(source, 'html.parser')

    # fetch a company page (same Baidu detour)
    def web_b(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)
        time.sleep(1)
        driver.get(url)
        time.sleep(2.2)
        source = driver.page_source
        return BeautifulSoup(source, 'html.parser')


key = '配送'  # search keyword; others tried: 物流经理 / 物流运营 / 物流管理 / 运营 / 物流 / 数据 / 运输 / 仓储 / 配送
salary = '08%252c09%252c10'  # 08 = 15-20K, 09 = 20-30K; '08%252c09%252c10' combines 15-20K, 20-30K and 30K+
timeday = '3'  # posting age: 1 = last three days, 2 = last week, 3 = last month
if __name__ == "__main__":opt = FirefoxOptions()  # ChromeOptions()  # 创建chrome参数# 不加载图片opt.set_preference('permissions.default.image', 2)opt.headless = False  # 显示浏览器driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化# driver.set_window_size(500, 900)# options = FirefoxOptions()# selenium = webdriver.Firefox(options=options)# job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'# 杭州,2-3万'https://search.51job.com/list/080200,000000,0000,00,9,09,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# 杭州1.5-2'https://search.51job.com/list/080200,000000,0000,00,9,08,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# # 招聘需求信息获取# myWeb = Web(job_url)  # 实例化类# time.sleep(0.2)# html = myWeb.web_a('https://jobs.51job.com/hangzhou-scq/125683481.html?s=sou_sou_soulb&t=0_0')  # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址# # df4 = jobRequire(html)  # 获取职位需求信息# df4 = jobRequire()# print(df4)# time.sleep(0.3)# 取前三页数据df = pd.DataFrame()  # 定义pands整理表格for i in range(5):try:  # '+str(i+1)+'#08表示1.5-20K,09表示20-30kprint(str(i), '获取第{}页数据'.format(i + 1))job_url = 'https://search.51job.com/list/080200,000000,0000,21,' + timeday + ',' + salary + ',' + key + ',2,' + str(i + 1) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare='#03%252c04&大专和本科学历print(job_url)'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加学历degreefrom'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加薪资'https://search.51job.com/list/080200,000000,0000,21,3,09,%25E8%25BF%2590%25E8%2590%25A5,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# 'https://search.51job.com/list/080200,000000,0000,00,3,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='# with open('jobhtml.html', 'r', encoding='utf-8') as f:#     html = BeautifulSoup(f, 'html.parser')#     html.list = html.find_all('div', attrs={'class': 'j_joblist'})time1 = time.time()  # 计算时长myWeb = Web(job_url)  # 实例化类  # 'https://jobs.51job.com/hangzhou-yhq/135494019.html?s=sou_sou_soulb&t=0_0')  # 实例化网址time.sleep(1 + 
1)html = myWeb.web()# print(html)for i, item in enumerate(html):# print(item,i,sep=',')item.list = item.find_all('div', attrs={'class': 'e'})  # 获取每个招聘岗位条目for i, item in enumerate(item.list):df1 = jobMesssage(item)  # 获取岗位# print(df1['招聘职位网址'])df2 = jobFirm(item)  # 获取公司url = str(df1['招聘职位网址'].values).strip("['").strip("']").strip('')print(url)url_b = str(df2['招聘公司网址'].values).strip("['").strip("']").strip('')print(url_b)# 招聘需求信息获取myWeb = Web(job_url)  # 实例化类time.sleep(0.3)html = myWeb.web_a(url)  # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址df4 = jobRequire(html)  # 获取职位需求信息print(df4)time.sleep(0.5 + 0.5 + 0.5)# 招聘公司信息获取myWeb = Web(job_url)  # 实例化类time.sleep(0.3)html = myWeb.web_b(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址df5 = firmMeessage(html)  # 获取职位需求信息print(df5)time.sleep(0.5 + 0.5 + 0.5)df3 = pd.concat([df1, df2], axis=1)df6 = pd.concat([df3, df4], axis=1)df7 = pd.concat([df5, df6], axis=1)df7.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')df = pd.concat([df, df7], axis=0)print(df)df.to_json('jobGain.json', orient='records', indent=1, force_ascii=False)time.sleep(0.5 + 0.5 + 0.5)time.sleep(0.5 + 0.5 + 0.5)print(str(i), '数据正常'.format(i + 1))time2 = time.time()  # 计算时长print('总耗时:{}'.format(time2 - time1))except:print(str(i), '数据异常'.format(i + 1))# key = '物流管理'  # 物流经理#物流运营# salary = '08'  # 08表示1.5-20K,09表示20-30kwith open('jobGain.json', 'r', encoding='utf-8') as f:data = json.load(f)# print(data)myWe = writeExcel(data)  # 写入excelmyWe.run()  # 执行多线程try:  # 关闭后台浏览器driver.close()driver.quit()os.system('taskkill /F /IM chromedriver.exe')  # 关闭进程浏览器sreach_windows = driver.current_window_handle# 获得当前所有打开的窗口的句柄all_handles = driver.window_handlesfor handle in all_handles:driver.switch_to.window(handle)driver.close()time.sleep(1.2)except:print('已完后台毕浏览器')
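A note on the odd-looking `%25E7%2589…` and `08%252c09%252c10` fragments in the search URLs above: 51job double percent-encodes its path parameters, so the keyword is UTF-8 percent-encoded and then encoded again (each `%` becomes `%25`). A minimal sketch of how such a fragment can be produced with only the standard library (the keyword value is just an illustration):

from urllib.parse import quote

keyword = '物流'                          # example keyword
once = quote(keyword, safe='')            # '%E7%89%A9%E6%B5%81'
twice = quote(once, safe='')              # '%25E7%2589%25A9%25E6%25B5%2581' — matches the URLs above
print(twice)

# the salary codes get the same treatment: '08,09,10' -> '08%2C09%2C10' -> '08%252C09%252C10'
# (percent-encoding is case-insensitive; the article's URLs use lowercase %252c)
print(quote(quote('08,09,10', safe=''), safe=''))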

Liepin (liepin.com) scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# extract job-posting info from the listing page
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    for i, item in enumerate(html):
        boxes = item.find_all('div', attrs={'class': 'job-detail-box'})
        for j, box in enumerate(boxes):
            try:
                info = box.find('a', attrs={'data-nick': 'job-detail-job-info'})
                df_jobMesssage['招聘职位网址'] = info.get('href'),
                df_jobMesssage['岗位名称'] = info.find('div', attrs={'class': 'job-title-box'}).text.strip().replace('\n', '').replace('\t', ''),
                df_jobMesssage['工作地及要求'] = info.find('div', attrs={'class': 'job-labels-box'}).text.strip().replace('\n', '').replace('\t', ''),
                df_jobMesssage['公司名称'] = box.find('div', attrs={'data-nick': 'job-detail-company-info'}).find('div', attrs={'class': 'job-company-info-box'}).text.strip().replace('\n', '').replace('\t', '')
                df_jobMesssage['薪资'] = box.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={'class': 'job-salary'}).text
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                print(str(j), 'job posting parsed OK')
            except Exception:
                print(str(j), 'job posting parse failed')
    return df
# fetch job requirements and company info via requests
def jobRequire(url):
    df = {}  # plain dict; values are stored as 1-tuples via the trailing commas
    # session cookie copied from a logged-in browser; it expires and has to be refreshed
    cookie = '__uuid=1632571874000.95; __s_bid=11011704223d5f9c92ff5bd3e81bc8334a74; __tlog=1632611231431.79%7C00000000%7C00000000%7C00000000%7C00000000; Hm_lvt_a2647413544f5a04f00da7eee0d5e200=1632571900,1632611231; Hm_lpvt_a2647413544f5a04f00da7eee0d5e200=1632615070; __session_seq=12; __uv_seq=12'
    headers = {
        # the original values embedded the header names ("User-Agent: ...", "Cookie: ...");
        # only the values belong here
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'Cookie': cookie,
        'Connection': 'keep-alive',
    }
    res = requests.get(url=url, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    html = BeautifulSoup(res.text, 'html.parser')
    time.sleep(0.1)
    # the original computed html.find_all('content') but then iterated the soup itself;
    # iterating the <content> frame is assumed to be the intent
    for i, item in enumerate(html.find_all('content')):
        try:
            df['招聘要求'] = item.find_all('section', attrs={'class': 'job-intro-container'})[0].text.strip('\n'),
            df['公司信息'] = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text.strip('\n'),
            print(df)
            print(str(i), 'detail parsed OK')
        except Exception:
            print(str(i), 'detail parse failed')
    return df


class Web:
    def __init__(self, url):
        self.url = url

    # fetch the search-result list page
    def web(self):
        driver.back()
        time.sleep(0.3)
        driver.get(self.url)
        time.sleep(1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'left-list-box'})

    # fetch a detail page (requirements and company info)
    def web_a(self, url):
        driver.back()
        time.sleep(0.3)
        driver.get(url)
        time.sleep(1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('content')  # overall frame
class writeExcel:
    def __init__(self, data):
        self.data = data

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业",
                 "招聘职位网址", "招聘要求", "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围',
                 '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title
        for i in range(len(self.data)):
            try:
                df_w = jobRequire(self.data[i]['招聘职位网址'])
                print(self.data[i]['招聘职位网址'])
                # optional anti-scraping pause kept from the original (commented out there):
                # if i % 9 == 8: time.sleep(20)  # rest after every 8 fetches
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 not provided by the listing
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 not provided
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 not provided
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 not provided
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                new_worksheet.cells[i + 1, 12].value = ''  # 福利 not provided
                new_worksheet.cells[i + 1, 13].value = key     # search keyword
                new_worksheet.cells[i + 1, 14].value = salary  # salary band
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date
                print(str(i), 'row written to Excel')
            except Exception:
                print(str(i), 'Excel row write failed')
        new_worksheet.autofit()
        new_workbook.save('jobliepin.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # direct call; the original Process(target=self.wE_r()) ran in the parent anyway
        self.wE_r()


df = pd.DataFrame()  # global accumulator
key = '物流经理'  # search keyword, e.g. 物流经理 / 物流运营
salary = '20$40'  # salary filter code; alternatives seen in the original: '20$40', '10$20'
if __name__ == "__main__":
    opt = ChromeOptions()
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)
    driver.set_window_size(300, 700)
    for i in range(3):
        try:
            print(str(i), 'fetching page {}'.format(i + 1))
            job_url = ('https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key='
                       + str(key) + '&dq=070020&salary=' + salary + '&pubTime=3&currentPage=' + str(i))
            print(job_url)
            time1 = time.time()  # timing
            myWeb = Web(job_url)
            html = myWeb.web()  # listing page
            time.sleep(0.5)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
            time2 = time.time()
            print(str(i), 'page {} OK'.format(i + 1))
            print('elapsed: {}'.format(time2 - time1))
        except Exception:
            print(str(i), 'page {} failed'.format(i + 1))

    # write to Excel
    with open('jobliepin.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    myWe = writeExcel(data)
    myWe.run()

    try:  # shut down the browser
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill the driver process
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
        time.sleep(1.2)
    except Exception:
        print('browser already closed')
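One pattern worth calling out, because it appears in every script here: assignments like `df['招聘要求'] = some_text,` end with a comma, so the right-hand side is a 1-tuple. With a DataFrame, pandas broadcasts that tuple into a one-row column; with a plain dict, the 1-tuple is stored as-is, which is why later code unwraps values with `[0]` or `.iloc[0, 0]`. A small self-contained demonstration (the strings are just examples):

import pandas as pd

df = pd.DataFrame()
df['招聘要求'] = '本科,3年经验',   # trailing comma -> 1-tuple -> one-row column
print(df.shape)                    # (1, 1)
print(df.iloc[0, 0])               # 本科,3年经验

d = {}
d['公司信息'] = '某物流公司',      # in a plain dict the 1-tuple is stored as-is
print(d['公司信息'][0])            # unwrap with [0]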

BOSS Zhipin (zhipin.com) scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# extract job-posting info from the listing page
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    for i, item in enumerate(html):
        entries = item.find_all('div', attrs={'class': 'job-primary'})
        for j, entry in enumerate(entries):  # one entry per posting
            try:
                welfare = entry.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace('\n', ' ')
                print(welfare, j, sep=',')
                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + entry.find('div', attrs={'class': 'primary-box'}).get('href'),
                df_jobMesssage['岗位名称'] = entry.find('div', attrs={'class': 'job-title'}).find('span', attrs={'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = entry.find('div', attrs={'class': 'job-title'}).find('span', attrs={'class': 'job-area-wrapper'}).text.strip('\n'),
                df_jobMesssage['公司名称'] = entry.find('div', attrs={'class': 'info-company'}).text.replace(' ', '').replace('\n', ' '),
                df_jobMesssage['薪资'] = entry.find('div', attrs={'class': 'job-limit clearfix'}).text.strip().replace('\n', ' '),
                df_jobMesssage['福利'] = welfare,
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(j), 'job posting parsed OK')
            except Exception:
                print(str(j), 'job posting parse failed')
    return df
def jobRequire(html):
    df = {}  # plain dict; values are stored as 1-tuples via the trailing commas
    # (a requests-based variant with a hand-copied cookie was also tried here and left
    #  commented out; zhipin.com's anti-scraping makes the selenium page source more reliable)
    for i, item in enumerate(html):
        text_block = item.find_all('div', attrs={'class': 'text'})[0].text.strip().replace(' ', '')
        print(text_block, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' '),
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ', ''),
            print(str(i), 'detail parsed OK')
        except Exception:
            print(str(i), 'detail parse failed')
    return df
class writeExcel:
    def __init__(self, data):
        self.data = data

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业",
                 "招聘职位网址", "招聘要求", "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围',
                 '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title
        # first pass: the fields already collected from the listing page
        for i in range(len(self.data)):
            try:
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 not provided by the listing
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 not provided
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 not provided
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 not provided
                new_worksheet.cells[i + 1, 12].value = ''  # 福利 not provided
                new_worksheet.cells[i + 1, 13].value = key  # search keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary band
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date
                print(str(i), 'row written to Excel')
            except Exception:
                print(str(i), 'Excel row write failed')
        # second pass: open each detail page for requirements and company info
        for i in range(len(self.data)):
            try:
                time1 = time.time()  # timing
                myWeb = Web(url)
                time.sleep(0.5)
                html = myWeb.web_a(self.data[i]['招聘职位网址'])
                df_w = jobRequire(html)
                print(df_w)
                time.sleep(3)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'detail columns written')
                time2 = time.time()
                print('elapsed: {}'.format(time2 - time1))
            except Exception:
                print(str(i), 'detail column write failed')
        new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # direct call; see the note on Process(target=...) after this script
        self.wE_r()
class Web:
    def __init__(self, url):
        self.url = url

    # fetch the search-result list page
    def web(self):
        driver.back()
        time.sleep(0.5)
        driver.get(self.url)
        time.sleep(1.5)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'job-list'})

    # fetch a detail page (requirements and company info)
    def web_a(self, url):
        driver.back()
        print('back/refresh')
        time.sleep(0.5)
        driver.get(url)
        time.sleep(2)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'job-detail'})  # overall frame
df = pd.DataFrame()  # global accumulator
key = '物流管理'  # search keyword, e.g. 物流经理 / 物流运营
salary = '5'  # 5 = 15-20K, 6 = 20-30K
if __name__ == '__main__':
    opt = ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}  # don't load images
    opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)
    driver.set_window_size(300, 700)
    url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
    for i in range(3):
        try:
            print(str(i), 'fetching page {}'.format(i + 1))
            url = ('https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key
                   + '&city=101210100&industry=&position=&ka=sel-salary-' + salary
                   + '&page=' + str(i + 1) + '&ka=page-' + str(i + 1))
            print(url)
            time1 = time.time()  # timing
            myWeb = Web(url)
            html = myWeb.web()  # listing page
            time.sleep(0.5)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            time2 = time.time()
            print(str(i), 'page {} OK'.format(i + 1))
            print('elapsed: {}'.format(time2 - time1))
        except Exception:
            print(str(i), 'page {} failed'.format(i + 1))

    # write to Excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    myWe = writeExcel(data)
    myWe.run()

    try:  # shut down the browser
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill the driver process
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
        time.sleep(1.2)
    except Exception:
        print('browser already closed')
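The `run` methods in these scripts originally read `multiprocessing.Process(target=self.wE_r())` — note the parentheses, which call `wE_r` immediately in the parent process and hand `Process` its return value (`None`) as the target, so the child process did nothing; the versions above therefore just call the method directly. If a real child process is wanted, the target must be the callable itself, and everything it touches must exist in the child. A minimal illustration (the `work` function is hypothetical):

import multiprocessing

def work():
    print('running in', multiprocessing.current_process().name)

if __name__ == '__main__':
    # wrong: work() runs here, in the parent ("MainProcess"), and Process gets target=None
    p_wrong = multiprocessing.Process(target=work())
    p_wrong.start(); p_wrong.join()   # the child does nothing

    # right: pass the callable; the child process invokes it
    p_right = multiprocessing.Process(target=work)
    p_right.start(); p_right.join()   # prints the child process name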

Complete BOSS code 2 — extracting job requirements and company info in a separate pass:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# jobMesssage is identical to the version in the previous BOSS script; it is only
# needed by the listing loop, which is disabled below, so it is not repeated here.


def jobRequire(html):
    df = {}  # plain dict; values are stored as 1-tuples via the trailing commas
    # (the commented-out requests/cookie variant from the previous script appeared
    #  here verbatim as well and is omitted)
    for i, item in enumerate(html):
        text_block = item.find_all('div', attrs={'class': 'text'})[0].text.strip().replace(' ', '')
        print(text_block, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' '),
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ', ''),
            print(str(i), 'detail parsed OK')
        except Exception:
            print(str(i), 'detail parse failed')
    return df


class writeExcel:
    def __init__(self, data):
        self.data = data

    # first pass: create jobBoss.xlsx from the listing data (same as the previous
    # script's wE_r, with the in-line detail pass removed — wE_r_a handles it now)
    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业",
                 "招聘职位网址", "招聘要求", "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围',
                 '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title
        for i in range(len(self.data)):
            try:
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 not provided by the listing
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 not provided
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 not provided
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 not provided
                new_worksheet.cells[i + 1, 12].value = ''  # 福利 not provided
                new_worksheet.cells[i + 1, 13].value = key  # search keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # original compared to the int 6, which never matched the string
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date
                print(str(i), 'row written to Excel')
            except Exception:
                print(str(i), 'Excel row write failed')
        new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    # second pass: reopen jobBoss.xlsx, read the posting URLs from column I, and
    # fill the requirement (column J) and company-info (column L) cells
    def wE_r_a(self):
        app = xw.App(visible=True, add_book=False)
        wb = app.books.open('jobBoss.xlsx')
        sh = wb.sheets['worksheet']
        rng = [v for v in sh.range("i:i").value if v is not None]  # posting URLs from column I
        print(rng)
        app.display_alerts = False
        app.screen_updating = False
        for i in range(len(rng) - 1):  # rng[0] is the header row
            try:
                time1 = time.time()  # timing
                myWeb = Web(url)
                time.sleep(0.5)
                html = myWeb.web_a(rng[i + 1])
                df_w = jobRequire(html)
                print(df_w)
                time.sleep(2.5)
                sh.cells[i + 1, 9].value = df_w['招聘要求']
                sh.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'detail columns written')
                time2 = time.time()
                print('elapsed: {}'.format(time2 - time1))
            except Exception:
                print(str(i), 'detail column write failed')
        sh.autofit()
        wb.save('jobBoss.xlsx')
        wb.close()
        app.quit()

    def run(self):
        self.wE_r()  # direct call; see the Process(target=...) note earlier

    def run_a(self):
        self.wE_r_a()


class Web:
    def __init__(self, url):
        self.url = url

    # fetch the search-result list page
    def web(self):
        driver.back()
        time.sleep(0.5)
        driver.get(self.url)
        time.sleep(1.5)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'job-list'})

    # fetch a detail page (requirements and company info)
    def web_a(self, url):
        driver.back()
        time.sleep(0.5)
        driver.get(url)
        time.sleep(1.5)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html.find_all('div', attrs={'class': 'job-detail'})  # overall frame


df = pd.DataFrame()  # global accumulator
key = '物流管理'  # search keyword, e.g. 物流经理 / 物流运营
salary = '6'  # 5 = 15-20K, 6 = 20-30K
if __name__ == '__main__':
    opt = ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}  # don't load images
    opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)
    driver.set_window_size(300, 700)
    url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'

    # The listing loop from the previous script sat here inside a triple-quoted string,
    # i.e. disabled; this run only back-fills the detail columns of jobBoss.xlsx.

    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    myWe = writeExcel(data)
    myWe.run_a()  # second pass only

    try:  # shut down the browser
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill the driver process
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
        time.sleep(1.2)
    except Exception:
        print('browser already closed')
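All of these scripts pace themselves with fixed `time.sleep` calls, and the Liepin version carries a commented-out "pause after every 8 fetches" idea. Randomized delays tend to look less mechanical to anti-scraping systems. Here is a small sketch of a helper that could replace the fixed sleeps — the interval bounds are arbitrary assumptions, not values from the original:

import random
import time
from itertools import count

_request_no = count(1)

def polite_sleep(lo=1.0, hi=3.0, every=8, long_pause=20):
    """Random 1-3 s delay between requests; every 8th request pauses ~20 s,
    mirroring the 'rest after every 8 fetches' idea commented out above."""
    n = next(_request_no)
    if n % every == 0:
        time.sleep(long_pause + random.uniform(0, 5))
    else:
        time.sleep(random.uniform(lo, hi))

# usage: call polite_sleep() wherever the scripts currently call time.sleep(...)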

Looking up individual postings and writing the results back to Excel:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# job requirements (same parser as in the 51job script)
def jobRequire(html):
    df = pd.DataFrame()
    main_blocks = html.find_all('div', attrs={'class': 'tCompany_main'})
    for i, block in enumerate(main_blocks):
        try:
            # fallback: the whole first tBorderTop_box flattened into one string
            jobRequir_a = (block.find('div', attrs={'class': 'tBorderTop_box'}).text
                           .strip().replace('\n', '').replace('\t', '').replace(' ', ''))
            paragraphs = block.find('div', attrs={'class': 'tBorderTop_box'}).find_all('p')
            jobRequir = [p.text.strip() + '\n' for p in paragraphs]  # one line per <p>
            jobRequirText = ''.join(jobRequir)
            if jobRequirText.find('任职要求') > 0:
                df['招聘要求'] = jobRequirText,
            else:
                df['招聘要求'] = jobRequir_a,
            print(str(i), 'job requirements parsed OK')
        except Exception:
            print(str(i), 'job requirements parse failed')
    return df
# company profile extraction
def firmMeessage(html):
    df = pd.DataFrame()
    full_blocks = html.find_all('div', attrs={'class': 'tCompany_full'})
    for i, block in enumerate(full_blocks):
        boxes = block.find_all('div', attrs={'class': 'tBorderTop_box'})
        try:
            df['公司信息'] = (boxes[0].text.strip().replace('\n', '')
                          .replace('\t', '').replace(' ', '')),
            print(str(i), 'company profile parsed OK')
        except Exception:
            print(str(i), 'company profile parse failed')
    return df


# fetch a job-detail page
def web_a(url):
    job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
    driver.back()
    driver.get(job_url)  # detour via Baidu to look less bot-like
    time.sleep(0.7)
    driver.get(url)
    time.sleep(2.2)
    source = driver.page_source
    return BeautifulSoup(source, 'html.parser')


# fetch a company profile page
def web_b(url):
    job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
    driver.back()
    driver.get(job_url)  # same Baidu detour
    time.sleep(1)
    driver.get(url)
    time.sleep(2.2)
    source = driver.page_source
    return BeautifulSoup(source, 'html.parser')


if __name__ == "__main__":
    opt = FirefoxOptions()
    opt.set_preference('permissions.default.image', 2)  # don't load images
    opt.headless = False  # show the browser window
    driver = webdriver.Firefox(options=opt)

    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展.xlsx')
    sh = wb.sheets['前程无忧']
    rng_firmMeessage = [v for v in sh.range("k:k").value if v is not None]  # company URLs (column K)
    rng_jobRequire = [v for v in sh.range("i:i").value if v is not None]    # posting URLs (column I)
    j = sh.range('a1').expand('table').rows.count  # row count
    app.display_alerts = False

    for i in range(len(rng_jobRequire) - 1):  # index 0 is the header row
        try:
            html = web_a(rng_jobRequire[i + 1])
            print(rng_jobRequire[i + 1])
            df4 = jobRequire(html)
            print(df4)
            sh.cells[i + 1, 9].value = df4.iloc[0, 0]
            print(str(i), 'requirements written OK')
        except Exception:
            print(str(i), 'requirements lookup failed')

    for i in range(len(rng_firmMeessage) - 1):
        try:
            html = web_b(rng_firmMeessage[i + 1])  # the original passed the URL straight to firmMeessage
            df5 = firmMeessage(html)
            print(df5)
            sh.cells[i + 1, 11].value = df5.iloc[0, 0]  # the original wrote to wb.cells, which does not exist on a Book
            print(str(i), 'company info written OK')
        except Exception:
            print(str(i), 'company info lookup failed')

    sh.autofit()
    wb.save('职业发展.xlsx')
    wb.close()
    app.quit()

    try:  # shut down the browser
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM geckodriver.exe')  # Firefox driver (original said chromedriver.exe)
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
        time.sleep(1.2)
    except Exception:
        print('browser already closed')
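Each script ends with the same belt-and-braces shutdown block: close, quit, `taskkill`, then a loop over window handles. In practice `driver.quit()` alone closes every window and terminates the driver process, so the whole block can usually shrink to a `try/finally`. A sketch of that simplification, assuming the Firefox setup used above (`scrape` is a hypothetical callable standing in for the scraping steps):

from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions

def run_with_cleanup(scrape):
    """Run a scraping callable with guaranteed browser shutdown."""
    opt = FirefoxOptions()
    opt.set_preference('permissions.default.image', 2)  # skip images, as above
    driver = webdriver.Firefox(options=opt)
    try:
        scrape(driver)  # ... all scraping happens here ...
    finally:
        driver.quit()  # closes every window and ends geckodriver; no taskkill loop needed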
