Goal: scrape train listings from Ctrip (trains.ctrip.com)

Trains: one-way trips and transfer trips

One-way
url="https://trains.ctrip.com/trainbooking/search?tocn=%25e5%258d%2583%25e5%25b2%259b%25e6%25b9%2596&fromcn=%25e6%259d%25ad%25e5%25b7%259e&day=2020-12-31"
Transfer
url="https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=2&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E5%258D%2583%25E5%25B2%259B%25E6%25B9%2596&departDate=2020-12-31"
parse.quote() handles the URL encoding
the csv module saves the scraped data
random.choice picks a User-Agent from a pool for each request, which I consider a good habit
the one-way listings sit in the page source of the original URL
the transfer listings come back as JSON from a separate endpoint (js_url)
A minimal sketch of the encoding follows.
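Here is that sketch, showing how repeated parse.quote() calls produce the doubly and triply encoded strings seen in the URLs above (standard library only; 杭州 is the departure city from the example URLs):

    from urllib import parse

    once = parse.quote("杭州")   # %E6%9D%AD%E5%B7%9E
    twice = parse.quote(once)    # %25E6%259D%25AD%25E5%25B7%259E, the fromCn value in the page URL
    thrice = parse.quote(twice)  # %2525E6%25259D%2525AD%2525E5%2525B7%25259E, the departureStation value in js_url
    # unquote the same number of times to recover the city name
    print(parse.unquote(parse.unquote(twice)))  # 杭州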

LET'S GO

url="https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=5&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E6%2596%25B0%25E4%25B9%25A1&departDate=2020-12-30" #  携程单程火车原网址 查询参数 fromcn 出发站 tocn 目的站 departDate 日期
#原网页查询参数需要进行两次url编码(注意点1)
#携程单程信息在原网页源代码中
'''
url="https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=2&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E5%258D%2583%25E5%25B2%259B%25E6%25B9%2596&departDate=2020-12-31"
js_url="https://trains.ctrip.com/pages/booking/getTransferList?departureStation=%2525E6%25259D%2525AD%2525E5%2525B7%25259E&arrivalStation=%2525E6%252596%2525B0%2525E4%2525B9%2525A1&departDateStr=2020-12-30"
携程中转网址火车中中转信息保存在json文件中(js_url) 查询参数departureStation arrivalStation departDateStr
类似稍加自己比较即可发现
js_url查询参数需要进行三次url编码(注意点2)
'''from urllib import parse
import random
from bs4 import BeautifulSoup
import  csv
import os
import requests
# print(parse.unquote((parse.unquote("%25E6%259D%25AD%25E5%25B7%259E"))))
fromArea = input("出发站")
toArea = input("目的站")
date = input("年-月-日 :")
if not os.path.exists("D:/携程查找练习"):  # create the folder the CSV files are saved to
    os.mkdir("D:/携程查找练习")

UA = [  # User-Agent pool; random.choice picks one per request
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
    "Opera/9.25 (Windows NT 5.1; U; en)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
    "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
]

class NewsByTransfer:  # scrapes the transfer (connection) listings
    def __init__(self):
        # reads the module-level input() values
        self.fromArea = fromArea
        self.toArea = toArea
        self.date = date

    def getOneJsUrl(self, fromArea, toArea, date):  # assemble js_url
        fromArea = parse.quote(parse.quote(fromArea))  # encoded twice, as the page URL expects
        departureStation = parse.quote(fromArea)       # third pass: js_url params are encoded three times
        toArea = parse.quote(parse.quote(toArea))
        arrivalStation = parse.quote(toArea)
        url = "https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=5&fromCn=" + fromArea + "&toCn=" + toArea  # original page URL, kept for reference (not requested)
        js_url = "https://trains.ctrip.com/pages/booking/getTransferList?departureStation=" + departureStation + "&arrivalStation=" + arrivalStation
        js_url = js_url + "&departDateStr=" + date
        print(js_url)
        return js_url

    def getOneNews(self, js_url):  # fetch js_url and pull out the transfer data
        user_agent = random.choice(UA)
        text = requests.get(js_url, headers={"User-Agent": user_agent}).json()  # parse the JSON response into a dict
        transferList = text["data"]["transferList"]  # first hop: the list holding the transfer records
        csvList = []  # rows for csv.DictWriter to write later
        for oneTransfer in transferList:
            # print(oneTransfer)  # debug
            tranDict = {}
            tranDict["总出发站"] = oneTransfer["departStation"]
            tranDict["总目的站"] = oneTransfer["arriveStation"]
            tranDict["总信息"] = (oneTransfer["transferStation"] + "换乘 停留" + oneTransfer["transferTakeTime"]
                                + " 全程" + oneTransfer["totalRuntime"] + " 价格" + oneTransfer["showPriceText"])
            for trainTransferInfos in oneTransfer["trainTransferInfos"]:  # one entry per leg of the trip
                tranDict[f"班次列车号{trainTransferInfos['sequence']}"] = trainTransferInfos["trainNo"]
                tranDict[f"发车时间-到站时间{trainTransferInfos['sequence']}"] = (trainTransferInfos["departDate"] + " "
                    + trainTransferInfos["departTime"] + "---" + trainTransferInfos["arriveDate"] + " " + trainTransferInfos["arriveTime"])
                tranDict[f"发车站-目的站{trainTransferInfos['sequence']}"] = trainTransferInfos["departStation"] + "---" + trainTransferInfos["arriveStation"]
            csvList.append(tranDict)
        print(csvList)
        return csvList

    def mkcsv(self, csvlist):  # write the rows to a CSV file
        # note: DictWriter takes its columns from the first row, so every row is assumed to have the same keys
        with open(f"D:/携程查找练习/{csvlist[0]['总出发站']}到{csvlist[0]['总目的站']}转站查找.csv", "w+", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, list(csvlist[0].keys()))
            writer.writeheader()
            writer.writerows(csvlist)

    def main(self):
        js_url = self.getOneJsUrl(self.fromArea, self.toArea, self.date)
        csvList = self.getOneNews(js_url)
        self.mkcsv(csvList)
        print(csvList)

class NewsBySingle:  # scrapes the one-way listings
    def __init__(self):
        self.fromArea = fromArea
        self.toArea = toArea
        self.date = date

    def getOneUrl(self, fromArea, toArea, date):  # assemble the one-way search URL
        fromArea = parse.quote(parse.quote(fromArea))  # encoded twice
        toArea = parse.quote(parse.quote(toArea))
        url = "https://trains.ctrip.com/trainBooking/search?ticketType=0&fromCn=" + fromArea + "&toCn=" + toArea + "&day=" + date + "&mkt_header=&orderSource="
        print(url)
        return url

    def getOneNews(self, url):  # fetch the page source and parse it with bs4
        user_agent = random.choice(UA)
        text = requests.get(url, headers={"User-Agent": user_agent}).content.decode("utf-8")  # page source for bs4
        # print(text)  # debug: dump the raw page source
        soup = BeautifulSoup(text, "lxml")
        oneTripList = soup.select("div.railway_list")  # one div per train
        print(len(oneTripList))
        oneTripNewList = []
        for oneTrip in oneTripList:
            # print(oneTrip)  # debug
            oneTripDict = {}
            oneTripDict["班次列车号"] = oneTrip.select("strong")[0].string
            oneTripDict["出发站名称"] = oneTrip.select("span")[0].string
            oneTripDict["出发站时间"] = oneTrip.select("strong")[1].string
            oneTripDict["中途时间"] = list(oneTrip.select("div.haoshi")[0].stripped_strings)[0]
            oneTripDict["目的站名称"] = oneTrip.select("span")[1].string
            oneTripDict["到站时间"] = oneTrip.select("strong")[2].string
            print(oneTripDict)
            oneTripNewList.append(oneTripDict)
            print("---" * 60)
        print(oneTripNewList)
        return oneTripNewList

    def mkcsv(self, oneTripNewList):
        with open(f"D:/携程查找练习/{oneTripNewList[0]['出发站名称']}到{oneTripNewList[0]['目的站名称']}单程查找.csv", "w+", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, list(oneTripNewList[0].keys()))
            writer.writeheader()
            writer.writerows(oneTripNewList)

    def main(self):
        url = self.getOneUrl(self.fromArea, self.toArea, self.date)
        oneTripNewList = self.getOneNews(url)
        self.mkcsv(oneTripNewList)
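For reference, this is the rough shape of the getTransferList JSON that getOneNews relies on. The field names are exactly the ones indexed in the code above; the values shown are placeholders, not real Ctrip data:

    # hypothetical response skeleton, inferred from the keys the code reads
    {
        "data": {
            "transferList": [
                {
                    "departStation": "...",     # overall departure station
                    "arriveStation": "...",     # overall arrival station
                    "transferStation": "...",   # station where you change trains
                    "transferTakeTime": "...",  # layover duration
                    "totalRuntime": "...",      # total travel time
                    "showPriceText": "...",     # displayed price
                    "trainTransferInfos": [     # one entry per leg
                        {"sequence": 1, "trainNo": "...",
                         "departDate": "...", "departTime": "...",
                         "arriveDate": "...", "arriveTime": "...",
                         "departStation": "...", "arriveStation": "..."},
                        # ...second leg with "sequence": 2...
                    ]
                }
            ]
        }
    }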
NewsByTransfer().main()  # run the transfer search
NewsBySingle().main()    # run the one-way search
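To try it, feed in the stations from the example URLs: 杭州 as the departure station, 千岛湖 or 新乡 as the destination, and a date such as 2020-12-31; the two CSV files end up in D:/携程查找练习. One caveat: the CSS selectors (div.railway_list, div.haoshi) and the JSON field names match what Ctrip served when this was written, so if the site changes, they will need updating.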

To wrap up: none of this is hard once the basics are in place: Python classes, bs4 parsing (which I find more intuitive than XPath), and a quick look at the XHR requests in the browser's Network tab. If this helped, a like would be appreciated; no need to follow, a like is enough.
