Scraping 医脉通 (Medlive) disease data: http://disease.medlive.cn

A multi-process scraper for the disease wiki on 医脉通 (Medlive): it walks the category list, follows each disease entry, pulls the diagnostic factors, recommended tests, and treatment details via XPath, and writes the results to Excel workbooks with xlwt.

import requests
import time
from lxml import etree
import re
import xlwt
import random
from multiprocessing import Process


class Yimaitong:
    def __init__(self):
        # Entry URL: the disease wiki category list page
        self.url = 'http://disease.medlive.cn/wiki/list/171'
        # Headers for the list and summary pages
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9',
            'Host': 'www.medlive.cn',
            'Referer': 'http://disease.medlive.cn/wiki/list/178',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36',
        }
        # Headers for the treatment-detail requests; parser_url() reads
        # self.header, so this must be an instance attribute
        self.header = {
            'Referer': 'http://disease.medlive.cn/wiki/entry/0_0_37977?row=1',
            'Cookie': 'ymt_pk_id=b7470997e9241352; _pk_ses.3.a971=*; sess=3ve23k417je2d1hhipuku66fg2; ymtinfo=eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9; Hm_lvt_62d92d99f7c1e7a31a11759de376479f=1588835592,1589005542,1589006182; Hm_lpvt_62d92d99f7c1e7a31a11759de376479f=1589013816; _pk_id.3.a971=b7470997e9241352.1588835592.7.1589013816.1588925929.',
        }

    def parser_url(self, start, end, pathtxt):
        # Fetch the top-level list page and collect the category links
        response = requests.get(self.url)
        html = etree.HTML(response.text)
        href = html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li/a/@href')
        fina_data = []
        for url in range(start, end):
            # Walk the requested range of category indices
            f = []
            time.sleep(0.2)
            new_url = 'http://disease.medlive.cn' + href[url]
            print(url, new_url)
            first_response = requests.get(new_url)
            new_html = etree.HTML(first_response.text)
            first_href = new_html.xpath('//*[@id="wiki_list_box"]/div[2]/ul/li/dl/dd/a/@href')
            # Disease category name, e.g. 儿科
            jibing_name = new_html.xpath(
                '//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li[' + str(url + 1) + ']/a/text()')[0]
            # Follow every disease entry in this category
            for second in first_href:
                list_data = []
                time.sleep(1)
                try:
                    second_url = 'http://disease.medlive.cn' + second
                    second_response = requests.get(second_url)
                    second_html = etree.HTML(second_response.text)
                    second_name = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/label/text()')[0]
                    second_href = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a/@href')[0]
                    # Open the "highlights" (精要) page
                    three_response = requests.get(second_href)
                    three_html = etree.HTML(three_response.text)
                    three_url = three_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[1]/div[2]/div[1]/dl/dd[3]/a/@href')[0]
                    four_url = 'http://disease.medlive.cn' + three_url
                    four_response = requests.get(four_url)
                    four_html = etree.HTML(four_response.text)
                    time.sleep(0.2)

                    # First "details" link: diagnostic factors
                    one_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[1]/p/a/@href')[0]
                    one_detail_url = 'http://disease.medlive.cn' + one_detail
                    one_detail_response = requests.get(one_detail_url)
                    one_detail_html = etree.HTML(one_detail_response.text)
                    keyword = ['关键因素']
                    keyword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div/h5/span[1]/text()'))
                    key_data = ['关键内容']
                    key_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/div/p/text()'))
                    otherword = ['其它诊断因素']
                    otherword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div/h5/span[1]/text()'))
                    other_data = ['其他诊断内容']
                    other_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/div/p/text()'))
                    dengerword = ['危险因素']
                    dengerword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div/h5/span[1]/text()'))
                    dengerdata = ['危险内容']
                    dengerdata.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/div/p/text()'))
                    list_data.append(keyword)
                    list_data.append(key_data)
                    list_data.append(dengerword)
                    list_data.append(dengerdata)
                    list_data.append(otherword)
                    list_data.append(other_data)
                    # time.sleep(random.randint(2, 8))  # optional extra jitter

                    # Second "details" link: tests
                    two_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[2]/p/a/@href')[0]
                    two_detail_url = 'http://disease.medlive.cn' + two_detail
                    two_detail_response = requests.get(two_detail_url)
                    two_detail_html = etree.HTML(two_detail_response.text)
                    precedence = ['优先检测']
                    precedence.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[1]/h5/span/text()'))
                    precedencedata = ['优先检测内容']
                    precedencedata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/table/tbody//text()'))
                    select = ['可选检测']
                    select.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[1]/h5/text()'))
                    selectdata = ['可选检测内容']
                    selectdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/table/tbody//text()'))
                    new = ['新的检测']
                    new.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[1]/h5/span/text()'))
                    newdata = ['新的检测内容']
                    newdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/table/tbody/tr//text()'))
                    list_data.append(precedence)
                    list_data.append(precedencedata)
                    list_data.append(select)
                    list_data.append(selectdata)
                    list_data.append(new)
                    list_data.append(newdata)
                    # time.sleep(random.randint(1, 5))  # optional extra jitter

                    # Third "details" link: treatment
                    three_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[3]/p/a/@href')[0]
                    three_detail_url = 'http://disease.medlive.cn' + three_detail
                    three_detail_response = requests.get(three_detail_url)
                    three_detail_html = etree.HTML(three_detail_response.text)
                    Treatment_conditions = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a//text()')
                    Treatment_conditions_url = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a/@href')
                    # Keep only the first non-whitespace token of each label
                    p = r'\S+'
                    three_re = ['治疗细则']
                    t_re = []
                    for i in Treatment_conditions:
                        t_re.append(re.findall(p, i)[0])
                    three_re.append(t_re)
                    three_data = ['治疗细则内容']
                    for Treatment_url in Treatment_conditions_url:
                        new_Treatment_url = 'http://disease.medlive.cn' + Treatment_url
                        new_Treatment_urlresponse = requests.get(new_Treatment_url, headers=self.header)
                        Treatment_urlresponse = etree.HTML(new_Treatment_urlresponse.text)
                        three_data.append(Treatment_urlresponse.xpath(
                            '//*[@id="wiki_view_frm"]/div/div[1]/div[2]//text()'))
                    list_data.append(three_re)
                    list_data.append(three_data)
                    f.append({second_name: list_data})
                except Exception:
                    # Skip entries whose page layout does not match the XPaths
                    continue
            # One dict per category: {category name: [per-disease dicts]}
            fina_data.append({jibing_name: f})
        # Optionally dump the raw data for debugging:
        # with open(pathtxt, 'w+') as fp:
        #     fp.write(str(fina_data))
        print(fina_data)
        return fina_data

    def first_parser_data(self, data, path):
        # Flatten the scraped structure into four columns:
        # category | disease | factor label | value
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row = 2
        for i in data:  # {category: [...]}, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
                print(0, jibingkey)
                worksheet.write(row, 0, jibingkey)
                for bingzhong in jibingvalue:  # {disease: [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        print(1, bingzhongkey)
                        worksheet.write(row, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # detail_data is [label, [values...]]; pad empty
                            # value lists so the label row is still written
                            if len(detail_data[1]) == 0:
                                detail_data[1].append('0')
                            for write_num in detail_data[1]:
                                for write_data in write_num:
                                    print(2, detail_data[0])
                                    worksheet.write(row, 2, detail_data[0])
                                    print(3, write_data)
                                    worksheet.write(row, 3, write_data)
                                    row += 1
        # Note: xlwt writes the legacy .xls format regardless of the
        # extension in `path`
        workbook.save(path)

    def parser_data(self, data):
        # Alternative layout: one row per factor group, one column per value
        # (unused; its call at the bottom of the file is commented out)
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row1 = 0
        row3 = 0
        row = 0
        a = 0
        for i in data:  # {category: [...]}
            for jibingkey, jibingvalue in i.items():
                worksheet.write(row3, 0, jibingkey)
                for bingzhong in jibingvalue:  # {disease: [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        worksheet.write(row1, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # One factor group, e.g. ['关键因素', ['肾脏病变', '听力障碍']]
                            worksheet.write(a, 2, detail_data[0])
                            a += 1
                            col = 3
                            for write_num in detail_data[1]:
                                worksheet.write(row, col, write_num)
                                col += 1
                            row += 1
                    row3 += 1
                row1 += 7
            row3 += row1
        workbook.save('医脉通—数据11-18.xlsx')

    def main(self):
        # Scrape five category ranges; each batch is written to its own
        # workbook from a separate process
        process_list = []
        for start, end, label in [(0, 5, '1-5'), (5, 10, '5-10'),
                                  (10, 15, '10-15'), (15, 20, '15-20'),
                                  (20, 25, '20-25')]:
            pathtxt = 'yimaitong %s.txt' % label
            data = self.parser_url(start, end, pathtxt)
            # xlwt will still write legacy .xls content under these names
            path = 'yimaitong %s.xlsx' % label
            p = Process(target=self.first_parser_data, args=(data, path))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()


# Guard the entry point so multiprocessing can spawn worker processes safely
if __name__ == '__main__':
    yimaitong = Yimaitong()
    # yimaitong.main()
    pathtxt1 = '内容.txt'
    data = yimaitong.parser_url(0, 1, pathtxt1)
    # yimaitong.parser_data(data)
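
For reference, parser_url returns a nested list-of-dicts structure. Here is a sketch of its shape, reconstructed from the code above; the concrete category, disease, and factor values are illustrative examples taken from the code comments:

# Shape of the value returned by parser_url (illustrative values only):
fina_data = [
    {  # one dict per disease category
        '儿科': [
            {  # one dict per disease
                'Alport综合征': [
                    ['关键因素', ['肾脏病变', '听力障碍']],  # [label, [values]]
                    ['关键内容', ['...']],
                    # ... followed by the 危险/其它诊断 factors, the 检测
                    # tests, and the 治疗细则 treatment entries, same form
                ],
            },
        ],
    },
]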
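The scraper issues a long chain of bare requests.get calls, so one dropped connection aborts an entire disease entry (and the except clause then silently skips it). A minimal hardening sketch using the standard requests/urllib3 retry machinery; make_session and its parameter values are assumptions, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Hypothetical helper: one shared session that retries transient failures
    retry = Retry(
        total=3,                                   # up to 3 retries per request
        backoff_factor=0.5,                        # sleep 0.5s, 1s, 2s between tries
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

# session.get() is then a drop-in replacement for the requests.get() calls:
# session = make_session()
# response = session.get('http://disease.medlive.cn/wiki/list/171', timeout=10)

A shared session also reuses the underlying TCP connection across the many per-disease requests, which is gentler on the site than opening a fresh connection each time.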
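One caveat on the output files: xlwt only produces the legacy binary .xls format, so the '.xlsx' names used above yield files that Excel flags as mismatched. If genuine .xlsx output is wanted, openpyxl is the usual replacement; a minimal sketch follows, where write_xlsx and its pre-flattened rows argument are illustrative assumptions, not part of the original script:

from openpyxl import Workbook

def write_xlsx(rows, path):
    # rows: list of (category, disease, label, value) tuples, a hypothetical
    # pre-flattened form of the scraped data
    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = 'Sheet1'
    # openpyxl rows/columns are 1-indexed, unlike xlwt's 0-indexed write()
    for r, (category, disease, label, value) in enumerate(rows, start=1):
        worksheet.cell(row=r, column=1, value=category)
        worksheet.cell(row=r, column=2, value=disease)
        worksheet.cell(row=r, column=3, value=label)
        worksheet.cell(row=r, column=4, value=value)
    workbook.save(path)

# write_xlsx([('儿科', 'Alport综合征', '关键因素', '肾脏病变')], 'yimaitong 1-5.xlsx')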
