Python 携程、去哪儿游记爬取
应别人的需求
我把以前的代码拿过来,改了改,获取了一些数据
爬取的内容就特别简单的那种,如下
携程
pip install -i https://pypi.doubanio.com/simple/ --trusted-host pypi.doubanio.com feapder
feapder create -j
{"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9","accept-encoding": "gzip, deflate, br","accept-language": "zh-CN,zh;q=0.9","cache-control": "max-age=0","cookie": "","sec-ch-ua": "\"Google Chrome\";v=\"95\", \"Chromium\";v=\"95\", \";Not A Brand\";v=\"99\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "document","sec-fetch-mode": "navigate","sec-fetch-site": "none","sec-fetch-user": "?1","upgrade-insecure-requests": "1","user-agent": ""
}
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
import re
from fake_useragent import UserAgent

# Compile the Chinese-character pattern once instead of per travelogue.
_CHINESE_RE = re.compile("[\u4e00-\u9fa5]+")


def get_one_page(url, req_headers=None):
    """Fetch *url* and return the response body, or None on any failure.

    *req_headers* overrides the module-level ``headers`` when given, so
    the detail-page request can carry its own cookie/User-Agent set.
    (The original code built that dict per entry but never sent it —
    every request silently went out with the generic headers.)
    """
    try:
        res = requests.get(url, headers=req_headers or headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def _extract_meta(result):
    """Parse the travelogue header text into its structured fields.

    Returns (days, month, per_person_cost, companions, play_style, places);
    each element is '' when its marker is absent or malformed.  Every
    ``re.findall(...)[0]`` is guarded — the original only guarded the
    '玩法' field and would raise IndexError on a truncated header.
    """
    n = m = ren = c = a = b = ''
    if '天数' in result:
        try:
            n = re.findall(r"天数:(.+?)天", result[result.rfind('天数'):])[0].strip() + '天'
        except Exception:
            n = ''
    if '时间' in result:
        try:
            m = re.findall(r"时间:(.+?)月", result[result.rfind('时间'):])[0].strip() + '月'
        except Exception:
            m = ''
    if '人均' in result:
        try:
            ren = re.findall(r"人均:(.+?)元", result[result.rfind('人均'):])[0].strip() + '元'
        except Exception:
            ren = ''
    if '和谁' in result:
        # Three characters right after the '和谁' marker (skipping '和谁:').
        c = result[result.rfind('和谁'):][3:6]
    if '玩法' in result:
        try:
            a = re.findall(r"玩法:(.+?)作者去了这些地方", result[result.rfind('玩法'):])[0].strip()
        except Exception:
            a = ''
    if '作者去了这些地方' in result:
        b = result[result.rfind('作者去了这些地方'):].replace(' ', '、')
        b = b.replace('、、 ', ',')
        b = b.replace('、', '')
        b = b.replace('作者去了这些地方:', '')
    return n, m, ren, c, a, b


def parse_one_page(html):
    """Walk one journal list page and append one spreadsheet row per entry.

    Writes into the module-level ``sheet``.  For each list item it also
    fetches the travelogue detail page to extract the full text and the
    structured trip metadata.
    """
    for ii in html.xpath('//a[@class="journal-item cf"]'):
        title = ii.xpath('.//dt[@class="ellipsis"]/text()')[0].strip()
        di_li = ii.xpath('.//span[@class="tips_a"]/text()')[0].strip().split('\n ')
        try:
            day = di_li[0]                         # trip length shown in the list
            trip_time = di_li[1]                   # month of the trip (was 'time'; shadowed the time module)
            people = di_li[3].replace(',', '')     # travel companions
            money = di_li[2].replace(',', '')      # per-person cost
        except Exception:
            day = trip_time = people = money = ''
        user1 = ii.xpath('.//dd[@class="item-user"]/text()')[0].strip()
        user = re.findall(r"(.+?)发表于", user1)[0].strip()
        fa = user1[user1.rfind('发表于'):].replace('发表于 ', '')
        url = 'https://you.ctrip.com' + ii.xpath('./@href')[0].strip()
        # Detail pages require a logged-in cookie set; now actually passed
        # to the request instead of being dead code.
        detail_headers = {
            'cookie': '_ga=GA1.2.50538359.1626942417; MKT_CKID=1626942416972.xsrkp.h14a; _RSG=vt4axMVXju2TUp4mgpTnUB; _RDG=28416d30204f5527dc27cd978da9f4f9ba; _RGUID=2e2d85f5-bb90-4df9-b7b1-773ab013379d; GUID=09031042315856507136; nfes_isSupportWebP=1; nfes_isSupportWebP=1; ibulanguage=CN; ibulocale=zh_cn; cookiePricesDisplayed=CNY; _gid=GA1.2.607075386.1635573932; MKT_Pagesource=PC; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&createtime=1635573932&Expires=1636178732460; MKT_CKID_LMT=1635573932634; _RF1=113.204.171.221; ASP.NET_SessionSvc=MTAuNjAuNDkuOTJ8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTYyMzE0MzgyNjI2MA; _bfa=1.1626942411832.2cm51p.1.1635573925821.1635580203950.4.26; _bfs=1.2; _jzqco=%7C%7C%7C%7C%7C1.429931237.1626942416968.1635580207564.1635580446965.1635580207564.1635580446965.0.0.0.19.19; __zpspc=9.4.1635580207.1635580446.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23; appFloatCnt=7; _bfi=p1%3D290602%26p2%3D0%26v1%3D26%26v2%3D25',
            # Space restored between 'Gecko)' and 'Chrome' (the original
            # concatenation produced 'Gecko)Chrome/...').
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        }
        page = get_one_page(url, detail_headers)
        if page is None:
            # Fetch failed; the original crashed in etree.HTML(None) here.
            continue
        hji2 = etree.HTML(page)
        nodes = hji2.xpath('//div[@class="ctd_content"]')
        try:
            quanwen = nodes[0].xpath('string(.)').strip().replace('\\n', '')
        except Exception:
            quanwen = ''
        # Keep only Chinese runs of the full text, comma-joined.
        results = ','.join(_CHINESE_RE.findall(quanwen))
        cai = hji2.xpath('//div[@class="ctd_content_controls cf"]')
        try:
            result = cai[0].xpath('string(.)').strip().replace('\\n', '').replace('\r\n', '')
        except Exception:
            result = ''
        n, m, ren1111, c, a, b = _extract_meta(result)
        sheet.append([fa, user, title, day, n, trip_time, m, people, c,
                      money, ren1111, a, b, url, results])
        print(ii)


def main(offset):
    """Fetch list page number *offset* and parse it into the spreadsheet."""
    base_url = 'https://you.ctrip.com/travels/qiandongnan2375/t2-p{}.html'
    print(offset)
    page = get_one_page(base_url.format(offset))
    if page is None:
        return  # request failed; skip this page instead of crashing
    parse_one_page(etree.HTML(page))


if __name__ == '__main__':
    wb = openpyxl.Workbook()   # workbook object
    sheet = wb.active          # active worksheet, shared by the workers
    # Column headers (the stray module-level `global ren1111` was a no-op
    # and has been removed).
    sheet.append(['发表于', '用户名', '标题', '出游天数', '天数', '具体时间', '出游时间(月份)',
                  '出行同伴', '和谁', '人均消费', '人均', '玩法', '作者去了这些地方', '链接', '全文'])
    # Default request headers for the list pages.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)                     # thread pool (multiprocessing.dummy)
    p.map(main, range(70, 75))      # map blocks until all pages are done
    p.close()
    p.join()                        # reap workers before saving/timing
    wb.save(r'info8.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
去哪儿
代码和携程的基本一样,只是解析的页面结构不同
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
import re
from fake_useragent import UserAgent

# Compile the Chinese-character pattern once instead of per travelogue.
_CHINESE_RE = re.compile("[\u4e00-\u9fa5]+")


def get_one_page(url):
    """Fetch *url* with the module-level ``headers``; None on any failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def _first_text(node, xp):
    """Return the stripped first text hit of XPath *xp* under *node*, or ''.

    Replaces the copy-pasted try/except blocks of the original.
    """
    try:
        return node.xpath(xp)[0].strip()
    except Exception:
        return ''


def _detail_item(tree, cls):
    """Return text node [2] of the ``f_item <cls>`` block on a detail page, or ''."""
    try:
        return tree.xpath('//li[@class="f_item {}"]//text()'.format(cls))[2]
    except Exception:
        return ''


def parse_one_page(html):
    """Walk one Qunar search-result page and append one row per travelogue.

    Writes into the module-level ``sheet``; also fetches each travelogue's
    detail page for the full text and the trip-summary fields.
    """
    ii_list = html.xpath('//li[@class="list_item "]')
    try:
        # The last entry uses a different class; fold it into the list.
        ii_list.append(html.xpath('//li[@class="list_item last_item"]')[0])
    except Exception:
        pass
    for ii in ii_list:
        try:
            title = ''.join(ii.xpath('.//h2[@class="tit"]//text()'))
        except Exception:
            title = ''
        user = _first_text(ii, './/span[@class="user_name"]/a/text()')
        day = _first_text(ii, './/span[@class="days"]/text()')
        # Was named 'time', shadowing the time module.
        trip_date = _first_text(ii, './/span[@class="date"]/text()')
        people = _first_text(ii, './/span[@class="people"]/text()')
        money = _first_text(ii, './/span[@class="fee"]/text()')
        try:
            places = ''.join(ii.xpath('.//p[@class="places"]//text()'))
            xingcheng = places[places.rfind('行程'):]   # itinerary part
            tujing = places[:places.rfind('行程')]      # waypoints part
        except Exception:
            xingcheng = ''
            tujing = ''
        # Travelogue detail URL.
        url = ('https://travel.qunar.com/travelbook/note/'
               + ii.xpath('.//h2[@class="tit"]/a/@href')[0].strip().replace('/youji/', ''))
        detail_headers = {'User-Agent': UserAgent(verify_ssl=False).random}
        try:
            res1 = requests.get(url, headers=detail_headers)
            hji2 = etree.HTML(res1.text)
        except Exception:
            continue  # detail fetch failed; skip entry instead of crashing
        try:
            quanwen = hji2.xpath('//div[@class="b_panel_schedule"]')[0] \
                          .xpath('string(.)').strip().replace('\\n', '')
        except Exception:
            quanwen = ''
        # Keep only Chinese runs of the full text, comma-joined.
        results = ','.join(_CHINESE_RE.findall(quanwen))
        chufa_date = _detail_item(hji2, 'when')             # departure date
        raw_tian = _detail_item(hji2, 'howlong')
        tian = raw_tian + '天' if raw_tian else ''           # days, suffixed only on success
        raw_fee = _detail_item(hji2, 'howmuch')
        fee = raw_fee + '元' if raw_fee else ''              # cost, suffixed only on success
        ren = _detail_item(hji2, 'who')                     # companions
        try:
            wan = ''.join(hji2.xpath('//li[@class="f_item how"]//text()')) \
                    .replace('玩法/', '').replace('\xa0', ' ').strip()
        except Exception:
            wan = ''
        sheet.append([user, title, day, tian, trip_date, chufa_date, people, ren,
                      fee, money, xingcheng, tujing, wan, url, results])
        print(ii)


def main(offset):
    """Fetch search-result page number *offset* and parse it."""
    base_url = 'https://travel.qunar.com/search/gonglue/22-qiandongnan-300125/hot_ctime/{}.htm'
    print(offset)
    page = get_one_page(base_url.format(offset))
    if page is None:
        return  # request failed; skip this page instead of crashing
    parse_one_page(etree.HTML(page))


if __name__ == '__main__':
    wb = openpyxl.Workbook()   # workbook object
    sheet = wb.active          # active worksheet
    # Column headers (the stray module-level `global ren1111` was a no-op
    # and has been removed, along with the dead commented-out Pool code).
    sheet.append(['用户名', '标题', '出游天数', '天数', '具体时间', '出游时间',
                  '出行同伴', '和谁', '人均消费', '人均', '行程', '途径', '玩法', '链接', '全文'])
    # Default request headers for the list pages.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    print('多线程爬取开始')
    start_time = time.time()
    # Sequential with a delay: Qunar throttles rapid requests.
    for i in range(30, 35):
        time.sleep(6)
        main(i)
    wb.save(r'去哪儿6.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
Python 携程、去哪儿游记爬取相关推荐
- python携程酒店评论_python爬取携程景点评论信息
今天要分析的网站是携程网,获取景点的用户评论,评论信息通过json返回API,页面是这个样子的 按下F12之后,F5刷新一下 具体需要URL Request的方式为POST,还需要你提取的哪一页,下面 ...
- python爬携程景区评论_python爬取携程景点评论信息
python爬取携程景点评论信息 今天要分析的网站是携程网,获取景点的用户评论,评论信息通过json返回API,页面是这个样子的 按下F12之后,F5刷新一下 具体需要URL Request的方式为P ...
- python协程第一课(实现爬取自己博客)
定义协程 # asyncio import asyncio # 定义一个协程函数 async def f1():await asyncio.sleep(1)#定义协程执行sleep(1)的时候,可以去 ...
- python协程gevent案例:爬取斗鱼美女图片
分析 分析网站寻找需要的网址 用谷歌浏览器摁F12打开开发者工具,然后打开斗鱼颜值分类的页面,如图: 在里面的请求中,最后发现它是以ajax加载的数据,数据格式为json,如图: 圈住的部分是我们需要 ...
- 7月更新 携程酒店价格房价爬取
5月,携程的反爬升级,新增加了加密参数 视屏地址:https://v.youku.com/v_show/id_XNDE3MzMxMjcxNg== 目前js代码已经扒出来了,可实现脱机操作,稳定的解密e ...
- python爬取携程酒店评论_python爬取携程酒店列表
做个笔记,亲测可用 ```python import requests import json from lxml import etree from bs4 import BeautifulSoup ...
- python爬去新浪微博_!如何通过python调用新浪微博的API来爬取数据
python抓取新浪微博,求教 爬手机端 可以参考的代码, #-*-coding:utf8-*- import smtplib from email.mime.text import MIMEText ...
- python项目开发案例集锦 豆瓣-Python第三个项目:爬取豆瓣《哪吒之魔童降世》 短评...
前面爬完网站信息图片之后,今天的又有了个小目标,最近的电影哪吒很火,去豆瓣上看了一下 影评,决定了今天主要是实现Python第三个项目:爬取豆瓣<哪吒之魔童降世> 短评,然后下载在exce ...
- 从入门到入土:Python爬虫学习|实例练手|爬取百度翻译|Selenium出击|绕过反爬机制|
此博客仅用于记录个人学习进度,学识浅薄,若有错误观点欢迎评论区指出.欢迎各位前来交流.(部分材料来源网络,若有侵权,立即删除) 本人博客所有文章纯属学习之用,不涉及商业利益.不合适引用,自当删除! 若 ...
最新文章
- PAT (Advanced Level) 1078. Hashing (25)
- 视觉直观感受 7 种常用的排序算法
- 安裝TA-Lib到想要罵髒話
- 类的加载过程一:Loading
- Spring注解编程基石(三)
- python 接收邮件服务器地址_Python 用IMAP接收邮件
- 从用户不足2000万到27亿,这项技术真要上天了?
- Oracle提供的序号函数
- QT_在循环中刷新界面
- [导入]Nebula3学习笔记(3): Core Namespace
- linux vi 底行命令,Linux下vi命令详解
- 硬盘pe安装红旗linux系统,WinPE 安装操作系统详细图解(图文教程)
- 矩阵开根号,工作矩阵平方根
- 如何站在巨人的肩膀上学习
- 如何挑选文档管理软件?
- Redis expire
- 中国500家企业【薪资待遇】一览
- 业务需求调研经验分享
- word自动编号变成黑块儿的原因及解决方案
- 计算机资源管理器出问题怎么办,W7系统资源管理器已停止工作怎么办
热门文章
- nginx proxy 详解,代理路径的转发
- 【C++】ODA的基本操作-平移、旋转、矩阵变换
- 手机开发实战163——视频介绍
- 用python画一个简单卡通人物_Python绘制可爱的卡通人物 | 【turtle使用】-Go语言中文社区...
- 艾灵网络完成战略轮融资
- 小程序SSL证书怎么选?
- 建设工程法规专科【1】
- 精准表达课学习心得《一》
- 交叉编译时undefined reference to `inflate_魅蓝风采现腕间 品鉴格拉苏蒂原创议员系列世界时腕表...
- 2023全球数字化营销洞察报告