Python爬取景点数据，看看该去哪里玩

写在开头

之前的伊犁篇[传送门]详细说明了代码过程，但那是基于 Jupyter Notebook 一步步交互得来的，代码是一段一段的，所以这里用函数重新整理了一下。

1、步骤思路

STEP1. 确定要去的区域，获得去哪儿网景点评价的网页地址
STEP2. 通过爬虫爬取网页数据，把感兴趣的数据爬取下来，同时获得景点的详细页面链接
STEP3. 通过pandas工具对数据进行清洗整理，转换格式
STEP4. 进行综合评价，得出排名，再爬取前n个景点的详细数据，添加表格
STEP5. 进行空间落位，筛选出想去的地方

2、代码

遵循“一个函数解决一件事情”的原则，代码如下：

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time#获取页面链接
def get_urls(ui, n):
    """Build the list of paginated listing URLs.

    Args:
        ui: URL prefix; the page number is appended directly to it.
        n: Number of pages. Pages are numbered 1 through ``n`` inclusive.

    Returns:
        A list of ``n`` URL strings (empty when ``n`` is less than 1).
    """
    return [ui + str(page) for page in range(1, n + 1)]


# Collect the basic information of the 10 attractions on each page
def get_onepage_data(u):
    """Scrape the attraction entries from one listing page.

    Args:
        u: URL of the listing page.

    Returns:
        A list with one dict per ``<li>`` found inside the
        ``<ul class="list_item clrfix">`` element. Every value is the raw
        string taken from the page; numeric conversion happens later.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        AttributeError, KeyError, TypeError: If the page does not have
            the expected structure.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(u, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    items = soup.find('ul', class_="list_item clrfix").find_all('li')

    data_jd = []
    for item in items:
        data_jd.append({
            'lat': item['data-lat'],
            'lng': item['data-lng'],
            '景点名称': item.find('span', class_="cn_tit").text,
            '攻略提到数量': item.find('div', class_="strategy_sum").text,
            '点评数量': item.find('div', class_="comment_sum").text,
            '景点排名': item.find('span', class_="ranking_sum").text,
            # Keep only the part before the '%' sign.
            '驴友去过': item.find('span', class_="sum").text.split('%')[0],
            # Segment of the inline style after the first ':' and before
            # '%' (presumably a width percentage -- verify against the page).
            '星级': item.find('span', class_="cur_star")['style'].split(':')[1].split('%')[0],
            '描述': item.find('div', class_="desbox").text,
            # href of the first <a> in the entry; used later for the
            # detail-page crawl.
            '链接': item.find('a')['href'],
        })
    return data_jd


# Collect the data of all n listing pages
def get_all_data(urls):
    """Scrape every listing page and assemble the results into a DataFrame.

    Args:
        urls: Iterable of listing-page URLs.

    Returns:
        A DataFrame with one row per attraction, indexed by the
        ``景点名称`` column (which is removed from the regular columns).
    """
    data_list = []
    for url in urls:
        data_list.extend(get_onepage_data(url))

    df = pd.DataFrame(data_list)
    # Use the attraction name as the index and drop it as a column.
    return df.set_index('景点名称')


# Convert string fields to numbers so they can be used in calculations
def data_collation(df):
    """Convert the scraped text columns to numeric dtypes, in place.

    ``lng`` and ``lat`` become floats; ``点评数量``, ``攻略提到数量``,
    ``驴友去过`` and ``星级`` become integers. Any missing values left in
    the frame afterwards are replaced with 0.

    Args:
        df: DataFrame containing the six columns listed above.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        ValueError: If a value cannot be parsed as a number.
    """
    # The builtin float/int are used because the np.float / np.int aliases
    # were removed from NumPy and raise AttributeError on current versions.
    for col in ('lng', 'lat'):
        df[col] = df[col].astype(float)
    for col in ('点评数量', '攻略提到数量', '驴友去过', '星级'):
        df[col] = df[col].astype(int)

    # Fill remaining missing values with 0 (applies to every column).
    df.fillna(value=0, inplace=True)
    return df


# Select the top n attractions by composite score
def data_top(urls, n):
    """Scrape the listing pages and return the ``n`` best-scoring attractions.

    The composite score ``综合得分`` is the sum of the ``驴友去过`` value and
    the min-max scaled (0-100) values of ``攻略提到数量``, ``星级`` and
    ``点评数量``.

    Args:
        urls: Iterable of listing-page URLs.
        n: Number of top rows to keep.

    Returns:
        A DataFrame of at most ``n`` rows sorted by ``综合得分`` in
        descending order. The intermediate ``*_b`` score columns are not
        included.
    """
    df = get_all_data(urls)
    df = data_collation(df)

    # Standardise each field to a 0-100 score.
    cols = ['攻略提到数量', '星级', '点评数量']
    for col in cols:
        low = df[col].min()
        span = df[col].max() - low
        if span == 0:
            # Every row has the same value: scaling would divide by zero
            # and turn the whole composite score into NaN.
            df[col + '_b'] = 0.0
        else:
            df[col + '_b'] = ((df[col] - low) / span * 100).round(2)

    df['综合得分'] = (
        df['驴友去过'] + df['攻略提到数量_b'] + df['星级_b'] + df['点评数量_b']
    )
    top_n = df.sort_values(by='综合得分', ascending=False).iloc[:n]
    # drop() returns a new frame instead of deleting columns from a slice.
    return top_n.drop(columns=[col + '_b' for col in cols])


# Scrape an attraction's detailed information through its link
def _node_text(node, default, inner_tag=None, inner_class=None):
    """Return the text of *node*, or of a child inside it, else *default*."""
    if node is None:
        return default
    if inner_tag is not None:
        if inner_class is None:
            node = node.find(inner_tag)
        else:
            node = node.find(inner_tag, class_=inner_class)
        if node is None:
            # The section exists but not the expected child element.
            return default
    return node.text


def get_onepage_info(name, u):
    """Scrape the detail page of a single attraction.

    Args:
        name: Attraction name, stored unchanged under ``景点名称``.
        u: URL of the attraction's detail page.

    Returns:
        A dict with the keys ``景点名称``, ``开放时间``, ``门票价格``,
        ``旅游时节`` and ``其他``. A section missing from the page gets a
        placeholder: ``'全年'`` for ``旅游时节`` and ``'无'`` for the others.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(u, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    content_class = "e_db_content_box e_db_content_dont_indent"

    return {
        '景点名称': name,
        '开放时间': _node_text(soup.find('td', class_="td_r"), '无', 'p'),
        '门票价格': _node_text(
            soup.find('div', class_="b_detail_section b_detail_ticket"),
            '无', 'div', content_class,
        ),
        '旅游时节': _node_text(
            soup.find('div', class_="b_detail_section b_detail_travelseason"),
            '全年', 'div', content_class,
        ),
        '其他': _node_text(soup.find('td', class_="td_l"), '无'),
    }


# Use the scraped links to collect the detailed information of all attractions
def get_detailed(df):
    """Append detail-page information to a table of attractions.

    For every row the page at ``row['链接']`` is scraped with
    ``get_onepage_info``. The summary columns ``链接``, ``攻略提到数量``,
    ``点评数量``, ``星级`` and ``驴友去过`` are left out of the result to
    keep it compact.

    Args:
        df: DataFrame indexed by attraction name that contains the five
            columns listed above. It is not modified.

    Returns:
        A new DataFrame: the remaining columns of ``df`` joined
        column-wise with the scraped detail columns.
    """
    detailed_data = [
        get_onepage_info(index, row['链接']) for index, row in df.iterrows()
    ]
    detailed_df = pd.DataFrame(detailed_data).set_index('景点名称')

    # drop() works on a copy, so the caller's frame keeps its columns.
    summary = df.drop(columns=['链接', '攻略提到数量', '点评数量', '星级', '驴友去过'])
    return pd.concat([summary, detailed_df], axis=1)  # add the detail columns


if __name__ == "__main__":
    start_time = time.time()

    # Listing URL prefix and number of pages to crawl.
    urls = get_urls('https://travel.qunar.com/p-cs299861-nanjing-jingdian-1-', 5)
    top30_data = data_top(urls, 30)  # top 30 attractions
    top30_data.to_excel('F://nanj_top30.xlsx')  # export to file
    end_time1 = time.time()
    print("爬取基本数据，耗时:", end_time1 - start_time)

    # Adding the detailed information is the slow part: one request per row.
    res = get_detailed(top30_data)
    res.to_excel('F://nanj_detailed.xlsx')  # export to file
    end_time2 = time.time()
    # Measured from program start, so this includes the first stage as well.
    print("添加详细信息，耗时:", end_time2 - start_time)

3、空间可视化结果如下

带有坐标的数据可以进行空间可视化，能看到景点的分布，旅游达人们就能知道大致的路线了。

景点主要分布在中山陵景区及附近

老门东—夫子庙—总统府—鸡鸣寺—玄武湖这条地铁3号线沿线，相对来说比较好组织路线

外围分布有大屠杀纪念馆、南京长江大桥、浦口火车站等个别景点，纯看个人兴趣了。

4、看看我们的数据集

nanj_top30.xlsx 的数据

nanj_detailed.xlsx的数据
详细信息里有多种信息，可以看到很多详细描述供参考！
看其中一个：
1、中山陵景区Sun Yatsen Mausoleum
描述：
沿着长长的石阶墓道爬上中山陵顶，瞻仰一代伟人的雕像。
门票价格：
免费开放。
开放时间：
周二-周日8:30-17:00；周一祭堂维护不开放，只能走到陵门为止（如恰逢法定节假日则正常开放）。中山陵陵寝自2018年6月1日起，试运行实施游客“预约”参观和“禁噪”管理。在试运行平稳基础上，拟于2019年1月1日起，正式施行中山陵陵寝“预约”参观和“禁噪”管理。
旅游时节：
四季皆宜。6月中旬－7月初为梅雨季节。南京以前有“火炉”之称，7－8月极端最高气温有时高达40℃，一般也在35℃左右。“夏热冬寒”是南京较之其它江南城市有过之而无不及的显著气候特征，通常12月份下雪机会最多，如果您有缘在南京遇上大雪，那也是令人神往的，江南的雪景更显妩媚动人。
其他：
地址:南京市玄武区石象路7号
电话:400-9288-312