python 携程_python 携程爬虫开发笔记

前言

最近购买了《Python3 爬虫、数据清洗与可视化实战》，刚好适逢暑假，就尝试从携程页面对广州的周边游产品进行爬虫数据捕捉。

因为学习Python还不到一个星期，对Python的命名规范还不太了解，只能套用之前iOS开发的命名规范，有不足之处请多多指点。

一、前期

1.主要用到的库

from bs4 import BeautifulSoup

import time

import re #正则表达式

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains #浏览器操作

import xlrd

import xlwt

from xlutils.copy import copy

import os

BeautifulSoup：用于对标签等数据进行定位和抓取

selenium：用于启动浏览器和对页面进行自动操作

time：暂停等待操作

xlrd、xlwt、xlutils：对数据结果进行Excel读写保存操作

2.核心思路

1，跳进出发点的周边游页面(广州)

2，在首页捕捉推荐的热门目的地和热点景点，进行保存

3，针对目的地地点进行遍历搜索所展示的旅游产品

4，产品数据参数抓取

5，数据保存

6，退出浏览器

二、代码

1.启动浏览器

def setupDriverSetting():
    """Start the browser and open the Ctrip "around" landing page.

    Creates a Chrome WebDriver, stores it in the module-level ``driver``
    global, and navigates it to the desktop landing page.
    """
    global driver

    # Alternative entry points kept from the original notes:
    #   cookie-id endpoint:
    #     'http://m.ctrip.com/restapi/soa2/10290/createclientid?systemcode=09&createtype=3&conte'
    #   mobile site:
    #     'https://m.ctrip.com/webapp/vacations/tour/list?tab=64&kwd=%E7%8F%A0%E6%B5%B7&salecity=32&searchtype=tour&sctiy=32'

    # Launch Chrome.
    driver = webdriver.Chrome()

    # Firefox alternative with a custom User-Agent header (kept for reference):
    #   profile = webdriver.FirefoxProfile()
    #   user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"
    #   profile.set_preference("general.useragent.override", user_agent)
    #   driver = webdriver.Firefox(profile)

    # Desktop site landing page.
    homePage = 'https://weekend.ctrip.com/around/'
    driver.get(homePage)

用webdriver启动Chrome或者Firefox，并打开首页URL

2.选择出发点城市

def select_StartPlace(startPlace):
    """Switch the site to the departure city whose link text equals ``startPlace``.

    Opens the city selector, scans every ``<a>`` inside its lists and, on the
    first exact text match, navigates to that link's ``href`` and waits up to
    five seconds for the search box of the city page to appear. If no link
    matches, the function returns without navigating.

    Args:
        startPlace: Visible name of the departure city; compared with ``==``
            against each link's text.
    """
    # Local import: keeps the file's top-level import block untouched.
    from selenium.common.exceptions import TimeoutException

    # Click the departure-city view to open the selector.
    driver.find_element(By.XPATH, "//*[@id='CitySelect']").click()

    # Walk through every city link offered by the selector.
    cityGroups = driver.find_elements(By.XPATH, "//*[@id='CitySelect']/dd/ul")
    for group in cityGroups:
        for cityLink in group.find_elements(By.TAG_NAME, "a"):
            if cityLink.text != startPlace:
                continue

            cityUrl = cityLink.get_attribute('href')
            print("找到目标城市:" + cityUrl)
            driver.get(cityUrl)
            time.sleep(2)
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
            except TimeoutException:
                print('出发地页面加载不成功')

            # Stop scanning altogether: the browser has navigated away, so the
            # remaining elements in cityGroups were located on the previous
            # page and should not be touched again.
            return

主要是用find_element_by_xpath寻找目标城市进行选择筛选，然后跳到城市专页

3.搜索目的地

def finAllDestinationPage():
    """Collect the destinations listed in the ``J_sub_circum`` side panel.

    Reads the inner HTML of the ``side_jmp_dest`` element inside the panel,
    parses it with BeautifulSoup and groups the link texts by category.

    Returns:
        dict: Maps each category heading (``.string`` of an ``<li>``'s
        ``<h4>``) to the list of ``.string`` values of the ``<a>`` tags found
        inside that ``<li>``.
    """
    # Per the original author's note, this id decides the product scope
    # (nearby trips vs. overseas trips).
    panel = driver.find_element(By.ID, "J_sub_circum")
    print(panel.text)

    destBox = panel.find_element(By.CLASS_NAME, "side_jmp_dest")
    soup = BeautifulSoup(destBox.get_attribute('innerHTML'), 'lxml')

    allDestinationListDic = {}
    for entry in soup.find_all('li'):
        heading = entry.h4
        if heading is None:
            # An <li> without a heading cannot be used as a category key.
            continue
        allDestinationListDic[heading.string] = [
            anchor.string for anchor in entry.find_all('a')
        ]

    return allDestinationListDic

搜索所有可推荐目的地和景点，并用字典保存

4.旅游产品列表页

def jump_destinationPage(startPlace,destination):
    """Search for ``destination`` and walk through every result page.

    Types the destination into the search box, submits the search, re-selects
    the departure city, reads the total page count from the sort bar and then
    visits each page, collecting every product on it with
    ``collectCurrentPageEachData``.

    Args:
        startPlace: Departure city name, forwarded to ``reSelect_StartPlace``.
        destination: Text typed into the search box.

    Returns:
        The module-level ``driver`` object.

    Raises:
        IndexError: If the page-count text contains fewer than two numbers.
    """
    # Local import: keeps the file's top-level import block untouched.
    from selenium.common.exceptions import TimeoutException

    # Locate the search box.
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
    except TimeoutException:
        print('查找不到搜索栏')
    finally:
        print('本地页面加载完毕')

    driver.find_element(By.XPATH, "//input[@id='SearchText']").send_keys(destination)
    print("输入目的地："+destination)
    driver.find_element(By.XPATH, "//*[@id='SearchBtn']").click()
    print("点击搜索按钮结束")
    time.sleep(2)

    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
    except TimeoutException:
        print('产品列表页加载不成功')
    finally:
        print('产品列表页加载完毕')

    # Select the departure city once more, to guard against a wrong one.
    reSelect_StartPlace(startPlace)

    # Read the pager summary text.
    pageText = driver.find_element(By.XPATH, "//*[@id='_sort']/div/span").text
    print(pageText)
    pageNumStr = pageText[:-1]  # the original drops the last character before parsing
    print("获取的num:" + pageNumStr)

    # Extract the numbers; the second one is used as the total page count.
    # NOTE(review): assumes the text always holds at least two numbers - confirm.
    pageNumS = re.findall(r'\d+', pageNumStr)
    pageNum = int(pageNumS[1])
    print(pageNum)

    # NOTE(review): the collected products are not returned or stored by this
    # function; the return value is kept as `driver` for existing callers.
    tourProductList = []
    for pageIndex in range(pageNum):
        itemList = showCurrentPageAllData()

        # Collect the data of every product on the current page.
        for itemIndex in range(len(itemList)):
            tourProductList.append(collectCurrentPageEachData(itemIndex))

        nextPage = pageIndex + 2
        if nextPage > pageNum:
            # Last page done: do not ask the pager for a page beyond the total.
            break

        # Jump to the next page through the page-number input.
        pageInput = driver.find_element(By.XPATH, "//input[@id='ipt_page_txt']")
        pageInput.clear()
        pageInput.send_keys(str(nextPage))
        driver.find_element(By.XPATH, "//*[@id='ipt_page_btn']").click()
        print("点击下一页结束->"+str(nextPage)+"页")
        time.sleep(2)

    return driver

跳进产品页，并根据标签，抓取总页数，在遍历所有旅游产品后，再跳到下一页进行循环遍历

5.产品数据抓取

def collectCurrentPageEachData(itemNum):
    """Extract the visible fields of one product card on the current page.

    Looks up all elements with class ``product_box``, takes the one at
    ``itemNum`` and parses its inner HTML with BeautifulSoup.

    Args:
        itemNum: Zero-based index into the ``product_box`` elements.

    Returns:
        dict: Keys ``'名称'``, ``'链接'``, ``'类型'``, ``'价格'``, ``'供应商'``,
        ``'评分'`` and ``'人数'``. ``'评分'`` is ``''`` when no grade is found;
        ``'人数'`` is an ``int``, or ``''`` when no count is found.
    """
    productBoxes = driver.find_elements(By.CLASS_NAME, "product_box")
    boxHtml = productBoxes[itemNum].get_attribute('innerHTML')  # card markup as a string
    item = BeautifulSoup(boxHtml, "lxml")

    # Product name.
    titleNameHtml = item.find('h2', class_='product_title')
    productName = titleNameHtml.get_text()
    print("-------" + productName)

    # Product link: only a protocol-relative href ("//host/path") needs a
    # scheme; any other form is left as it is instead of being truncated.
    productLink = titleNameHtml.a['href']
    if productLink.startswith("//"):
        productLink = "https:" + productLink
    print("link:" + productLink)

    # Product type (text of the first <em> in the card).
    productTypeStr = item.find('em').get_text()
    print("type:" + productTypeStr)

    # Product price: purely numeric prices are normalised to two decimals.
    priceStr = item.find('span', class_='sr_price').strong.get_text()
    if priceStr.isdigit():
        priceStr = "%.2f" % float(priceStr)
    print("price:" + priceStr)

    # Product supplier.
    productRetailStr = item.find('p', class_='product_retail')['title']
    if "供应商" in productRetailStr:
        # Presumably drops a 4-character "供应商" + separator prefix - confirm.
        productRetailStr = productRetailStr[4:]
    print("retail:" + productRetailStr)

    # Product grade; missing markup shows up as an attribute access on None.
    try:
        gradeStr = item.find('p', class_='grade').strong.get_text()
        print("grade:" + gradeStr)
    except AttributeError:
        print('查找不到评分')
        gradeStr = ''

    # Number of travellers: first number in the comment block's <em>.
    try:
        commentStr = item.find('div', class_='comment').em.get_text()
        commentNum = int(re.findall(r'\d+', commentStr)[0])
        print("comment:", commentNum)
    except (AttributeError, IndexError):
        print('查找不到出游人数')
        commentNum = ''

    return {
        '名称': productName,
        '链接': productLink,
        '类型': productTypeStr,
        '价格': priceStr,
        '供应商': productRetailStr,
        '评分': gradeStr,
        '人数': commentNum,
    }

在产品页面上获取所有可见信息，并返回

6.数据保存

class ExcelFileManager:
    """Create ``.xls`` workbooks in the working directory and append rows to them.

    Both methods are static, so they can be called on the class
    (``ExcelFileManager.creatExcelFile(...)``) as well as on an instance.
    """

    @staticmethod
    def creatExcelFile(fileName,sheetName,headRowList):
        """Ensure ``<cwd>/<fileName>.xls`` has ``sheetName`` with a header row.

        Args:
            fileName: Workbook name without the ``.xls`` extension.
            sheetName: Name of the sheet to add, or to reuse if adding fails.
            headRowList: Header cell values written to row 0.
        """
        # The workbook lives in the current working directory.
        filePath = os.path.join(os.getcwd(), fileName + '.xls')

        # Reuse the existing workbook; start a new one if it cannot be opened.
        try:
            workbook = copy(xlrd.open_workbook(filePath))
        except Exception:
            workbook = xlwt.Workbook()
            print("新建文件")

        # Add the sheet; if that fails, fall back to the sheet of that name.
        try:
            sheet1 = workbook.add_sheet(sheetName, cell_overwrite_ok=True)
        except Exception:
            sheet1 = workbook.get_sheet(sheetName)

        # Header style.
        head_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on',num_format_str='#,##0.00')
        for column, title in enumerate(headRowList):
            sheet1.write(0, column, title, head_style)

        print(filePath)
        workbook.save(filePath)

    @staticmethod
    def addDataToExcelFile(fileName,sheetName,dataList):
        """Append ``dataList`` below the existing rows of ``sheetName``.

        Args:
            fileName: Workbook name without the ``.xls`` extension.
            sheetName: Name of an existing sheet in that workbook.
            dataList: Sequence of rows; each row is indexed by integer
                position ``0 .. len(row) - 1``.
        """
        filePath = os.path.join(os.getcwd(), fileName + '.xls')
        readBook = xlrd.open_workbook(filePath)

        # Number of rows already present; new rows go below them.
        existingRows = readBook.sheet_by_name(sheetName).nrows

        writeBook = copy(readBook)
        sheet = writeBook.get_sheet(sheetName)
        try:
            for rowOffset, row in enumerate(dataList):
                # Positional indexing is kept on purpose so rows are read
                # exactly as before (row[0], row[1], ...).
                for column in range(len(row)):
                    sheet.write(rowOffset + existingRows, column, row[column])
        except Exception as e:
            # Best effort: report the problem and still save what was written.
            print(e)

        writeBook.save(filePath)

Excel文件创建与保存数据。不得不说，Python对Excel的支持不是很友好：xlrd只负责读、xlwt只负责写，二者都不能直接在已有的Excel文件上增加sheet或追加数据，需要借助xlutils的copy把xlrd读到的工作簿转换成可写的xlwt工作簿后再保存。

三、抓取结果：

1530848043475.jpg

python 携程_python 携程爬虫开发笔记相关推荐

python3携程_python携程
介绍协程(coroutine),又称为微线程,纤程.协程的作用:在执行A函数的时候,可以随时中断,去执行B函数,然后中断继续执行A函数(可以自动切换),但这一过程并不是函数调用(没有调用语句),过程 ...
python asyncio教程_Python 协程模块 asyncio 使用指南
Python 协程模块 asyncio 使用指南前面我们通过5 分钟入门 Python 协程了解了什么是协程,协程的优点和缺点和如何在 Python 中实现一个协程.没有看过的同学建议去看看.这篇文 ...
python gevent缺点_python 协程 greenlet gevent
一.并发的本质切换+保存状态 cpu正在运行一个任务,会在两种情况下切走去执行其他的任务(切换由操作系统强制控制),一种情况是该任务发生了阻塞,另外一种情况是该任务计算的时间过长时间片到了二.协程 ...
python协成_Python协程（上）
几个概念: event_loop 事件循环:程序开启一个无限的循环,程序员会把一些函数注册到事件循环上.当满足事件发生的时候,调用相应的协程函数. coroutine 协程:协程对象,指一个使用asy ...
python使用协程_Python 协程使用心得
基本概念协程:又称微线程,纤程.英文名Coroutine.协程是一种子程序,它在执行过程中可以中断,然后转而执行别的子程序,在适当的时候再返回来接着执行. 注意:如程序内不需要中断,则不要定义成协程 ...
python从网址爬图片协程_python协程gevent案例爬取斗鱼图片过程解析
分析分析网站寻找需要的网址用谷歌浏览器摁F12打开开发者工具,然后打开斗鱼颜值分类的页面,如图: 在里面的请求中,最后发现它是以ajax加载的数据,数据格式为json,如图: 圈住的部分是我们需要 ...
python从网址爬图片协程_python协程gevent案例：爬取斗鱼美女图片
分析分析网站寻找需要的网址用谷歌浏览器摁F12打开开发者工具,然后打开斗鱼颜值分类的页面,如图: 在里面的请求中,最后发现它是以ajax加载的数据,数据格式为json,如图: 圈住的部分是我们需要 ...
python 协程_Python 协程与 Go 协程的区别（一）
? "Python猫" ,一个值得加星标的公众号花下猫语:年关将近,不知各位过得怎样?我最近有些忙,收获也挺多,以后有机会分享下.吃饭时间,追了两部剧<了不起的麦瑟尔夫人& ...
python协成_Python协程技术的演进
引言 1.1. 存储器山存储器山是 Randal Bryant 在<深入理解计算机系统>一书中提出的概念. 基于成本.效率的考量,计算机存储器被设计成多级金字塔结构,塔顶是速度最快.成本 ...

python 携程_python 携程爬虫开发笔记

python 携程_python 携程爬虫开发笔记相关推荐

最新文章

热门文章