import requests
import time, random, csv
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from threadpool import ThreadPool, makeRequests

import logging
import os

logger = logging.getLogger(__name__)

# File that records the city codes already handed to request_url().
DONE_FILE = 'has_elong.json'
# Input file: one "code,name,letter" line per city.
CITY_FILE = 'elong.json'
# Directory that receives one CSV file per city.
OUTPUT_DIR = '艺龙'
# List pages requested per city: 1 .. LAST_PAGE inclusive.
LAST_PAGE = 88
# Upper bound on the number of hotels kept from a single list page.
MAX_HOTELS_PER_PAGE = 25
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Number of worker threads used by the script entry point.
WORKER_THREADS = 3
# Columns: hotel name, price, address, star level, theme, services, hotel info.
CSV_HEADER = ['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息']


def _normalize_city_code(city_code):
    """Return the city code as a string, prefixed with '0' when its value is below 1000."""
    if city_code and int(city_code) < 1000:
        return '0' + str(city_code)
    return str(city_code)


def _build_list_payload(city_id, city_name, page):
    """Build the form fields sent with the hotel-list request for one page."""
    return {
        "code": "7140144",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "1",
        "listRequest.cardNo": "192928",
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in date
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out date
        "listRequest.cityID": city_id,
        "listRequest.cityName": city_name,  # e.g. Beijing
        "listRequest.customLevel": "11",
        "listRequest.distance": "20",
        "listRequest.endLat": "0",
        "listRequest.endLng": "0",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "0",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "0",
        "listRequest.language": "cn",
        "listRequest.listType": "0",
        "listRequest.lowPrice": "0",
        "listRequest.orderFromID": "50",
        "listRequest.pageIndex": page,  # pagination
        "listRequest.pageSize": "20",
        "listRequest.payMethod": "0",
        "listRequest.personOfRoom": "0",
        "listRequest.poiId": "0",
        "listRequest.promotionChannelCode": "0000",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "0",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "1",
        "listRequest.seoHotelStar": "0",
        "listRequest.sortDirection": "1",
        "listRequest.sortMethod": "1",
        "listRequest.starLevels": "",
        "listRequest.startLat": "0",
        "listRequest.startLng": "0",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61",
    }


def _build_headers(city_letter, user_agent):
    """Build the HTTP headers sent with the hotel-list request."""
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        # NOTE(review): fixed value, presumably copied from a captured
        # browser request -- confirm it is still wanted.
        'Content-Length': '1599',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie':'……61b8-48a1-b398-8b9ec1903f05……',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest',
    }


def request_url(city_code, city_name, city_letter):
    """Crawl the hotel list pages of one city and save the rows to a CSV file.

    The city code is appended to ``has_elong.json`` first, then list pages
    1 to ``LAST_PAGE`` of ``http://hotel.elong.com/<city_letter>/`` are
    requested and every parsed hotel is written to ``艺龙/<city_name>.csv``.

    Args:
        city_code: City identifier; values below 1000 are sent with a
            leading '0'.
        city_name: City name, used in the request and as the CSV file name.
        city_letter: URL path segment of the city on hotel.elong.com.
    """
    # NOTE: the city is recorded before it is crawled, so a city whose crawl
    # fails is still skipped on the next run.
    with open(DONE_FILE, 'a+', encoding='utf-8') as done:
        done.write(str(city_code) + '\n')

    city_id = _normalize_city_code(city_code)
    url = 'http://hotel.elong.com/%s/' % city_letter

    # The output directory may not exist on a first run.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    csv_path = '%s/%s.csv' % (OUTPUT_DIR, city_name)

    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows).
    with open(csv_path, 'w+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(CSV_HEADER)

        # One generator object is enough; .random still yields a fresh
        # user agent for every page.
        user_agents = UserAgent(verify_ssl=False)

        for page in range(1, LAST_PAGE + 1):
            payload = _build_list_payload(city_id, city_name, page)
            headers = _build_headers(city_letter, user_agents.random)
            try:
                # Random pause between pages to keep the request rate low.
                time.sleep(random.randint(1, 4))
                res = requests.get(url, data=payload, headers=headers,
                                   timeout=REQUEST_TIMEOUT)
                for row in get_info_and_req_details(res.text):
                    writer.writerow(row)
            except Exception:
                # Best effort: a failed page must not abort the whole city,
                # but it should leave a trace.
                logger.warning('page %s of %s failed', page, city_name,
                               exc_info=True)
                continue


def _fetch_details(pic):
    """Follow the hotel's detail link and return ``(services, hotel_info)``.

    Returns two empty strings when the link is missing or the request fails.
    """
    try:
        link = pic.find('a').get('href')
        if not link:
            return '', ''
        # Random pause before each detail request.
        time.sleep(random.randint(1, 3))
        detail = requests.get('http://hotel.elong.com%s#hotelContent' % link,
                              timeout=REQUEST_TIMEOUT)
        return get_details(detail.text)
    except Exception:
        logger.debug('detail page request failed', exc_info=True)
        return '', ''


def _parse_hotel(hotel):
    """Turn one ``h_item`` element into a CSV row.

    Raises AttributeError when a mandatory element (name, price, area,
    address) is missing.
    """
    pic = hotel.find('div', attrs={'class': 'h_info_pic'})
    name = pic.find('img').get('alt')
    price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
    area = (hotel.find('p', attrs={'class': 'h_info_b2'}).find('a')
            .get_text().replace('[', '').replace(']', ''))
    address = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')

    # Star level and theme are optional; fall back to defaults when absent.
    try:
        grade = hotel.find('b', attrs={'class': 'icon_stars'}).get('title')
    except AttributeError:
        grade = '经济型'
    try:
        theme = hotel.find('div', attrs={'class': 'tagList'}).get_text().replace('\n', ',')
    except AttributeError:
        theme = ''

    server, hotel_info = _fetch_details(pic)
    return [name, price, str(area) + str(address), grade, theme, server, hotel_info]


def get_info_and_req_details(html):
    """Parse one hotel-list page and request the detail page of each hotel.

    Args:
        html: HTML text of a list page.

    Returns:
        A list of rows ``[name, price, address, star level, theme, services,
        hotel info]``, at most ``MAX_HOTELS_PER_PAGE`` of them. Hotels whose
        mandatory fields cannot be parsed are skipped.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []
    for hotel in soup.find_all('div', attrs={'class': 'h_item'}):
        if len(rows) >= MAX_HOTELS_PER_PAGE:
            break
        try:
            rows.append(_parse_hotel(hotel))
        except Exception:
            logger.debug('skipping hotel entry that could not be parsed',
                         exc_info=True)
            continue
    return rows


def get_details(detail_html):
    """Extract the service list and the hotel description from a detail page.

    Args:
        detail_html: HTML text of a hotel detail page.

    Returns:
        A ``(services, hotel_info)`` tuple of strings; each part is ``''``
        when its element is not present in the page.
    """
    detail = BeautifulSoup(detail_html, 'lxml')
    server = ''
    hotel_info = ''

    # The two elements are looked up independently so that a missing service
    # list does not discard an available description.
    services = detail.find('ul', attrs={'class': 'dview_icon_list'})
    if services is not None:
        server = services.get_text().replace('\n', ',')

    intro = detail.find('div', attrs={'class': 'dview_info'})
    if intro is not None:
        hotel_info = intro.get_text().replace('\n', ',').replace('\t', ',')

    return server, hotel_info


def _load_done_codes(path=DONE_FILE):
    """Return the set of city codes (as ints) recorded in *path*.

    A missing file yields an empty set, so the first run works.
    """
    done = set()
    try:
        with open(path, encoding='utf-8') as f:
            for raw in f:
                raw = raw.strip()
                if raw:
                    done.add(int(raw))
    except FileNotFoundError:
        pass
    return done


def main():
    """Queue every not-yet-recorded city from ``elong.json`` on a thread pool."""
    logging.basicConfig(level=logging.INFO)

    # Read the record of handled cities once instead of once per city line.
    done_codes = _load_done_codes()

    # Original author's note: for how to crawl the city list itself, refer to
    # the companion crawler for Ctrip hotel information.
    req_list = []
    with open(CITY_FILE, encoding='utf-8') as cities:
        for line in cities:
            line = line.replace("\n", "")
            if not line.strip():
                continue
            fields = line.split(',')
            if len(fields) != 3:
                logger.warning('skipping malformed city line: %r', line)
                continue
            if int(fields[0]) in done_codes:
                continue
            # (positional args, keyword args) as expected by makeRequests.
            req_list.append((fields, None))

    pool = ThreadPool(WORKER_THREADS)
    for work in makeRequests(request_url, req_list):
        pool.putRequest(work)
    pool.wait()


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/wuyan717/p/10509740.html

《使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中》相关推荐

  1. Python爬取中国大学排名,并且保存到excel中

    前言 以下文章来源于数据分析和Python ,作者冈坂日川 今天发的是python爬虫爬取中国大学排名,并且保存到excel中,当然这个代码很简单,我用了半小时就写完了,我的整体框架非常清晰,可以直接 ...

  2. 使用selenium,xpath,线程池爬取斗鱼主播信息

    使用xpath,线程池爬取斗鱼主播信息: 主要爬取主播昵称,直播内容分类,房间名称,房间号以及人气,共爬取了大概110多页数据,大概15000条,保存在txt文本中, import timefrom ...

  3. Python爬取URP教务系统课程表并保存到excel

    Python爬取URP教务系统课程表并保存到excel 爬取URP教务系统课程表最终结果如图所示: 接下来开始操作: 首先打开教务系统->按F12->点击Network->刷新一下界 ...

  4. 爬取网易云在线课程并保存到Excel

    一.准备工作 1.打开网易云课堂,搜索Python相关课程,选择全部查看 2.打开谷歌浏览器,使用检查功能(F12)分析页面,在NetWork-XHR中发现所有课程信息都保存在"studyc ...

  5. post请求爬取艺龙酒店的评论

    爬取酒店的评论 使用的库 import urllib2 import requests import re import time import json 通过抓包发现酒店的id在asyncsearc ...

  6. 爬取猫眼top100数据,并保存到excel

    本文代码参考了崔庆才先生的视频教学,此篇仅作为自己的学习记录. import requests # 爬取网站 from requests.exceptions import RequestExcept ...

  7. 爬取奇书网各类小说信息并保存到excel中

    一开始用的保存函数是将原来的文件内容替换掉,所以换了一种方法就可以追加数据内容了 两种方法的对比 1.追加数据的函数 old_file = xlrd.open_workbook('qishu.xls' ...

  8. 基于requests模块的cookie,session和线程池爬取

    基于requests模块的cookie,session和线程池爬取 有些时候,我们在使用爬虫程序去爬取一些用户相关信息的数据(爬取张三"人人网"个人主页数据)时,如果使用之前req ...

  9. 【每日爬虫】:利用线程池爬取2万张装修效果图

    文章目录 一.前言 二.需求 三.技术路线 四.线程池爬取2万张装修效果图 五.其他 一.前言 2020-04-08日爬虫练习 每日一个爬虫小练习,学习爬虫的记得关注哦! 学习编程就像学习骑自行车一样 ...

最新文章

  1. 企业级监控软件Zabbix搭建部署之使用mutt+msmtp配置Zabbix邮件报警
  2. 解决Windows对JDK默认版本切换问题
  3. sip消息概念(一)
  4. asp隐藏邮箱部分字符_asp.net core 中使用 signalR(二)
  5. configtx.yaml中文注解
  6. 配置NAT超载(NAPT)
  7. vim编辑器初级(一)
  8. mapreduce新编程实例
  9. 最全的Gateway统一网关快速入门
  10. 代码执行器 hook console.log 方案
  11. 教你怎样用CAD做三维图
  12. 卡通动漫游戏人物网页模板_灰色 卡通 动漫 游戏 漂亮 精美 整站 斜纹 质感
  13. 阿里视频云web播放器常见问题汇总
  14. 互联网正在消灭中产阶级
  15. 这种国家的外贸不做也罢
  16. 深入浅出网络编程TCP,UDP,Socket,Http网络编程面试题
  17. 古典密码——代替密码
  18. IPv6 地址数量有多少,能够分配到地球上的每一粒尘埃吗
  19. Android11 读写权限申请
  20. 总线与接口(内部总线、系统总线、外部总线)

热门文章

  1. ug许可证错误未连接服务器10004,ug12许可错误,服务器未连接-10004ug打开报错 | 老伙计...
  2. 工具教程第六讲:MyToken行情软件使用(一)
  3. 服务器显示tl是什么意思,-tl 数据库服务器选项
  4. 谷歌中文输入法linux版
  5. 戴着VR头盔教机器人抓握,机器人当场就学会了
  6. 成小胖学习微服务架构·基础篇
  7. 转载:改“条件导向法”为“目标倒推法”
  8. 特征权重的处理与最终排名
  9. 代码 操作 excel 打印且适应纸张大小
  10. android人脸表情,unity人脸面部表情视觉识别插件Dlib FaceLandmark Detector 1.2.7