Python 携程、去哪儿游记爬取
应别人的需求
我把以前的代码拿过来,改了改,获取了一些数据
爬取的内容就特别简单的那种,如下
携程
pip install -i https://pypi.doubanio.com/simple/ --trusted-host pypi.doubanio.com feapder
feapder create -j
{"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9","accept-encoding": "gzip, deflate, br","accept-language": "zh-CN,zh;q=0.9","cache-control": "max-age=0","cookie": "","sec-ch-ua": "\"Google Chrome\";v=\"95\", \"Chromium\";v=\"95\", \";Not A Brand\";v=\"99\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "document","sec-fetch-mode": "navigate","sec-fetch-site": "none","sec-fetch-user": "?1","upgrade-insecure-requests": "1","user-agent": ""
}
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
import re
from fake_useragent import UserAgent

# Compile the Chinese-character pattern once instead of per travelogue.
_CHINESE_RE = re.compile("[\u4e00-\u9fa5]+")


def get_one_page(url, req_headers=None):
    """Fetch *url* and return the response body, or None on any failure.

    *req_headers* overrides the module-level ``headers`` when given, so
    the detail-page request can carry its own cookie/User-Agent set.
    (The original code built that dict per entry but never sent it —
    every request silently went out with the generic headers.)
    """
    try:
        res = requests.get(url, headers=req_headers or headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def _extract_meta(result):
    """Parse the travelogue header text into its structured fields.

    Returns (days, month, per_person_cost, companions, play_style, places);
    each element is '' when its marker is absent or malformed.  Every
    ``re.findall(...)[0]`` is guarded — the original only guarded the
    '玩法' field and would raise IndexError on a truncated header.
    """
    n = m = ren = c = a = b = ''
    if '天数' in result:
        try:
            n = re.findall(r"天数:(.+?)天", result[result.rfind('天数'):])[0].strip() + '天'
        except Exception:
            n = ''
    if '时间' in result:
        try:
            m = re.findall(r"时间:(.+?)月", result[result.rfind('时间'):])[0].strip() + '月'
        except Exception:
            m = ''
    if '人均' in result:
        try:
            ren = re.findall(r"人均:(.+?)元", result[result.rfind('人均'):])[0].strip() + '元'
        except Exception:
            ren = ''
    if '和谁' in result:
        # Three characters right after the '和谁' marker (skipping '和谁:').
        c = result[result.rfind('和谁'):][3:6]
    if '玩法' in result:
        try:
            a = re.findall(r"玩法:(.+?)作者去了这些地方", result[result.rfind('玩法'):])[0].strip()
        except Exception:
            a = ''
    if '作者去了这些地方' in result:
        b = result[result.rfind('作者去了这些地方'):].replace(' ', '、')
        b = b.replace('、、 ', ',')
        b = b.replace('、', '')
        b = b.replace('作者去了这些地方:', '')
    return n, m, ren, c, a, b


def parse_one_page(html):
    """Walk one journal list page and append one spreadsheet row per entry.

    Writes into the module-level ``sheet``.  For each list item it also
    fetches the travelogue detail page to extract the full text and the
    structured trip metadata.
    """
    for ii in html.xpath('//a[@class="journal-item cf"]'):
        title = ii.xpath('.//dt[@class="ellipsis"]/text()')[0].strip()
        di_li = ii.xpath('.//span[@class="tips_a"]/text()')[0].strip().split('\n ')
        try:
            day = di_li[0]                         # trip length shown in the list
            trip_time = di_li[1]                   # month of the trip (was 'time'; shadowed the time module)
            people = di_li[3].replace(',', '')     # travel companions
            money = di_li[2].replace(',', '')      # per-person cost
        except Exception:
            day = trip_time = people = money = ''
        user1 = ii.xpath('.//dd[@class="item-user"]/text()')[0].strip()
        user = re.findall(r"(.+?)发表于", user1)[0].strip()
        fa = user1[user1.rfind('发表于'):].replace('发表于 ', '')
        url = 'https://you.ctrip.com' + ii.xpath('./@href')[0].strip()
        # Detail pages require a logged-in cookie set; now actually passed
        # to the request instead of being dead code.
        detail_headers = {
            'cookie': '_ga=GA1.2.50538359.1626942417; MKT_CKID=1626942416972.xsrkp.h14a; _RSG=vt4axMVXju2TUp4mgpTnUB; _RDG=28416d30204f5527dc27cd978da9f4f9ba; _RGUID=2e2d85f5-bb90-4df9-b7b1-773ab013379d; GUID=09031042315856507136; nfes_isSupportWebP=1; nfes_isSupportWebP=1; ibulanguage=CN; ibulocale=zh_cn; cookiePricesDisplayed=CNY; _gid=GA1.2.607075386.1635573932; MKT_Pagesource=PC; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&createtime=1635573932&Expires=1636178732460; MKT_CKID_LMT=1635573932634; _RF1=113.204.171.221; ASP.NET_SessionSvc=MTAuNjAuNDkuOTJ8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTYyMzE0MzgyNjI2MA; _bfa=1.1626942411832.2cm51p.1.1635573925821.1635580203950.4.26; _bfs=1.2; _jzqco=%7C%7C%7C%7C%7C1.429931237.1626942416968.1635580207564.1635580446965.1635580207564.1635580446965.0.0.0.19.19; __zpspc=9.4.1635580207.1635580446.2%232%7Cwww.baidu.com%7C%7C%7C%7C%23; appFloatCnt=7; _bfi=p1%3D290602%26p2%3D0%26v1%3D26%26v2%3D25',
            # Space restored between 'Gecko)' and 'Chrome' (the original
            # concatenation produced 'Gecko)Chrome/...').
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        }
        page = get_one_page(url, detail_headers)
        if page is None:
            # Fetch failed; the original crashed in etree.HTML(None) here.
            continue
        hji2 = etree.HTML(page)
        nodes = hji2.xpath('//div[@class="ctd_content"]')
        try:
            quanwen = nodes[0].xpath('string(.)').strip().replace('\\n', '')
        except Exception:
            quanwen = ''
        # Keep only Chinese runs of the full text, comma-joined.
        results = ','.join(_CHINESE_RE.findall(quanwen))
        cai = hji2.xpath('//div[@class="ctd_content_controls cf"]')
        try:
            result = cai[0].xpath('string(.)').strip().replace('\\n', '').replace('\r\n', '')
        except Exception:
            result = ''
        n, m, ren1111, c, a, b = _extract_meta(result)
        sheet.append([fa, user, title, day, n, trip_time, m, people, c,
                      money, ren1111, a, b, url, results])
        print(ii)


def main(offset):
    """Fetch list page number *offset* and parse it into the spreadsheet."""
    base_url = 'https://you.ctrip.com/travels/qiandongnan2375/t2-p{}.html'
    print(offset)
    page = get_one_page(base_url.format(offset))
    if page is None:
        return  # request failed; skip this page instead of crashing
    parse_one_page(etree.HTML(page))


if __name__ == '__main__':
    wb = openpyxl.Workbook()   # workbook object
    sheet = wb.active          # active worksheet, shared by the workers
    # Column headers (the stray module-level `global ren1111` was a no-op
    # and has been removed).
    sheet.append(['发表于', '用户名', '标题', '出游天数', '天数', '具体时间', '出游时间(月份)',
                  '出行同伴', '和谁', '人均消费', '人均', '玩法', '作者去了这些地方', '链接', '全文'])
    # Default request headers for the list pages.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    print('多线程爬取开始')
    start_time = time.time()
    p = Pool(8)                     # thread pool (multiprocessing.dummy)
    p.map(main, range(70, 75))      # map blocks until all pages are done
    p.close()
    p.join()                        # reap workers before saving/timing
    wb.save(r'info8.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
去哪儿
代码和携程的基本一样,只是解析的页面结构不同
import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
import re
from fake_useragent import UserAgent

# Compile the Chinese-character pattern once instead of per travelogue.
_CHINESE_RE = re.compile("[\u4e00-\u9fa5]+")


def get_one_page(url):
    """Fetch *url* with the module-level ``headers``; None on any failure."""
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def _first_text(node, xp):
    """Return the stripped first text hit of XPath *xp* under *node*, or ''.

    Replaces the copy-pasted try/except blocks of the original.
    """
    try:
        return node.xpath(xp)[0].strip()
    except Exception:
        return ''


def _detail_item(tree, cls):
    """Return text node [2] of the ``f_item <cls>`` block on a detail page, or ''."""
    try:
        return tree.xpath('//li[@class="f_item {}"]//text()'.format(cls))[2]
    except Exception:
        return ''


def parse_one_page(html):
    """Walk one Qunar search-result page and append one row per travelogue.

    Writes into the module-level ``sheet``; also fetches each travelogue's
    detail page for the full text and the trip-summary fields.
    """
    ii_list = html.xpath('//li[@class="list_item "]')
    try:
        # The last entry uses a different class; fold it into the list.
        ii_list.append(html.xpath('//li[@class="list_item last_item"]')[0])
    except Exception:
        pass
    for ii in ii_list:
        try:
            title = ''.join(ii.xpath('.//h2[@class="tit"]//text()'))
        except Exception:
            title = ''
        user = _first_text(ii, './/span[@class="user_name"]/a/text()')
        day = _first_text(ii, './/span[@class="days"]/text()')
        # Was named 'time', shadowing the time module.
        trip_date = _first_text(ii, './/span[@class="date"]/text()')
        people = _first_text(ii, './/span[@class="people"]/text()')
        money = _first_text(ii, './/span[@class="fee"]/text()')
        try:
            places = ''.join(ii.xpath('.//p[@class="places"]//text()'))
            xingcheng = places[places.rfind('行程'):]   # itinerary part
            tujing = places[:places.rfind('行程')]      # waypoints part
        except Exception:
            xingcheng = ''
            tujing = ''
        # Travelogue detail URL.
        url = ('https://travel.qunar.com/travelbook/note/'
               + ii.xpath('.//h2[@class="tit"]/a/@href')[0].strip().replace('/youji/', ''))
        detail_headers = {'User-Agent': UserAgent(verify_ssl=False).random}
        try:
            res1 = requests.get(url, headers=detail_headers)
            hji2 = etree.HTML(res1.text)
        except Exception:
            continue  # detail fetch failed; skip entry instead of crashing
        try:
            quanwen = hji2.xpath('//div[@class="b_panel_schedule"]')[0] \
                          .xpath('string(.)').strip().replace('\\n', '')
        except Exception:
            quanwen = ''
        # Keep only Chinese runs of the full text, comma-joined.
        results = ','.join(_CHINESE_RE.findall(quanwen))
        chufa_date = _detail_item(hji2, 'when')             # departure date
        raw_tian = _detail_item(hji2, 'howlong')
        tian = raw_tian + '天' if raw_tian else ''           # days, suffixed only on success
        raw_fee = _detail_item(hji2, 'howmuch')
        fee = raw_fee + '元' if raw_fee else ''              # cost, suffixed only on success
        ren = _detail_item(hji2, 'who')                     # companions
        try:
            wan = ''.join(hji2.xpath('//li[@class="f_item how"]//text()')) \
                    .replace('玩法/', '').replace('\xa0', ' ').strip()
        except Exception:
            wan = ''
        sheet.append([user, title, day, tian, trip_date, chufa_date, people, ren,
                      fee, money, xingcheng, tujing, wan, url, results])
        print(ii)


def main(offset):
    """Fetch search-result page number *offset* and parse it."""
    base_url = 'https://travel.qunar.com/search/gonglue/22-qiandongnan-300125/hot_ctime/{}.htm'
    print(offset)
    page = get_one_page(base_url.format(offset))
    if page is None:
        return  # request failed; skip this page instead of crashing
    parse_one_page(etree.HTML(page))


if __name__ == '__main__':
    wb = openpyxl.Workbook()   # workbook object
    sheet = wb.active          # active worksheet
    # Column headers (the stray module-level `global ren1111` was a no-op
    # and has been removed, along with the dead commented-out Pool code).
    sheet.append(['用户名', '标题', '出游天数', '天数', '具体时间', '出游时间',
                  '出行同伴', '和谁', '人均消费', '人均', '行程', '途径', '玩法', '链接', '全文'])
    # Default request headers for the list pages.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    print('多线程爬取开始')
    start_time = time.time()
    # Sequential with a delay: Qunar throttles rapid requests.
    for i in range(30, 35):
        time.sleep(6)
        main(i)
    wb.save(r'去哪儿6.xlsx')
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
Python 携程、去哪儿游记爬取相关推荐
- python携程酒店评论_python爬取携程景点评论信息
今天要分析的网站是携程网,获取景点的用户评论,评论信息通过json返回API,页面是这个样子的 按下F12之后,F5刷新一下 具体需要URL Request的方式为POST,还需要你提取的哪一页,下面 ...
- python爬携程景区评论_python爬取携程景点评论信息
python爬取携程景点评论信息 今天要分析的网站是携程网,获取景点的用户评论,评论信息通过json返回API,页面是这个样子的 按下F12之后,F5刷新一下 具体需要URL Request的方式为P ...
- python协程第一课(实现爬取自己博客)
定义协程 # asyncio import asyncio # 定义一个协程函数 async def f1():await asyncio.sleep(1)#定义协程执行sleep(1)的时候,可以去 ...
- python协程gevent案例:爬取斗鱼美女图片
分析 分析网站寻找需要的网址 用谷歌浏览器摁F12打开开发者工具,然后打开斗鱼颜值分类的页面,如图: 在里面的请求中,最后发现它是以ajax加载的数据,数据格式为json,如图: 圈住的部分是我们需要 ...
- 7月更新 携程酒店价格房价爬取
5月,携程的反爬升级,新增加了加密参数 视屏地址:https://v.youku.com/v_show/id_XNDE3MzMxMjcxNg== 目前js代码已经扒出来了,可实现脱机操作,稳定的解密e ...
- python爬取携程酒店评论_python爬取携程酒店列表
做个笔记,亲测可用 ```python import requests import json from lxml import etree from bs4 import BeautifulSoup ...
- python爬去新浪微博_!如何通过python调用新浪微博的API来爬取数据
python抓取新浪微博,求教 爬手机端 可以参考的代码, #-*-coding:utf8-*- import smtplib from email.mime.text import MIMEText ...
- python项目开发案例集锦 豆瓣-Python第三个项目:爬取豆瓣《哪吒之魔童降世》 短评...
前面爬完网站信息图片之后,今天的又有了个小目标,最近的电影哪吒很火,去豆瓣上看了一下 影评,决定了今天主要是实现Python第三个项目:爬取豆瓣<哪吒之魔童降世> 短评,然后下载在exce ...
- 从入门到入土:Python爬虫学习|实例练手|爬取百度翻译|Selenium出击|绕过反爬机制|
此博客仅用于记录个人学习进度,学识浅薄,若有错误观点欢迎评论区指出.欢迎各位前来交流.(部分材料来源网络,若有侵权,立即删除) 本人博客所有文章纯属学习之用,不涉及商业利益.不合适引用,自当删除! 若 ...
最新文章
- PAT (Advanced Level) 1078. Hashing (25)
- 视觉直观感受 7 种常用的排序算法
- 安裝TA-Lib到想要罵髒話
- 类的加载过程一:Loading
- Spring注解编程基石(三)
- python 接收邮件服务器地址_Python 用IMAP接收邮件
- 从用户不足2000万到27亿,这项技术真要上天了?
- Oracle提供的序号函数
- QT_在循环中刷新界面
- [导入]Nebula3学习笔记(3): Core Namespace
- linux vi 底行命令,Linux下vi命令详解
- 硬盘pe安装红旗linux系统,WinPE 安装操作系统详细图解(图文教程)
- 矩阵开根号,工作矩阵平方根
- 如何站在巨人的肩膀上学习
- 如何挑选文档管理软件?
- Redis expire
- 中国500家企业【薪资待遇】一览
- 业务需求调研经验分享
- word自动编号变成黑块儿的原因及解决方案
- 计算机资源管理器出问题怎么办,W7系统资源管理器已停止工作怎么办
热门文章
- nginx proxy 详解,代理路径的转发
- 【C++】ODA的基本操作-平移、旋转、矩阵变换
- 手机开发实战163——视频介绍
- 用python画一个简单卡通人物_Python绘制可爱的卡通人物 | 【turtle使用】-Go语言中文社区...
- 艾灵网络完成战略轮融资
- 小程序SSL证书怎么选?
- 建设工程法规专科【1】
- 精准表达课学习心得《一》
- 交叉编译时undefined reference to `inflate_魅蓝风采现腕间 品鉴格拉苏蒂原创议员系列世界时腕表...
- 2023全球数字化营销洞察报告