Scraping 医脉通 (Medlive) disease data: http://disease.medlive.cn

A multi-process scraper for the disease wiki on 医脉通 (Medlive): it walks the category list, follows each disease entry, pulls the diagnostic factors, recommended tests, and treatment details via XPath, and writes the results to Excel workbooks with xlwt.

import requests
import time
from lxml import etree
import re
import xlwt
import random
from multiprocessing import Process


class Yimaitong:
    def __init__(self):
        # Entry URL: the disease wiki category list page
        self.url = 'http://disease.medlive.cn/wiki/list/171'
        # Headers for the list and summary pages
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9',
            'Host': 'www.medlive.cn',
            'Referer': 'http://disease.medlive.cn/wiki/list/178',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36',
        }
        # Headers for the treatment-detail requests; parser_url() reads
        # self.header, so this must be an instance attribute
        self.header = {
            'Referer': 'http://disease.medlive.cn/wiki/entry/0_0_37977?row=1',
            'Cookie': 'ymt_pk_id=b7470997e9241352; _pk_ses.3.a971=*; sess=3ve23k417je2d1hhipuku66fg2; ymtinfo=eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9; Hm_lvt_62d92d99f7c1e7a31a11759de376479f=1588835592,1589005542,1589006182; Hm_lpvt_62d92d99f7c1e7a31a11759de376479f=1589013816; _pk_id.3.a971=b7470997e9241352.1588835592.7.1589013816.1588925929.',
        }

    def parser_url(self, start, end, pathtxt):
        # Fetch the top-level list page and collect the category links
        response = requests.get(self.url)
        html = etree.HTML(response.text)
        href = html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li/a/@href')
        fina_data = []
        for url in range(start, end):
            # Walk the requested range of category indices
            f = []
            time.sleep(0.2)
            new_url = 'http://disease.medlive.cn' + href[url]
            print(url, new_url)
            first_response = requests.get(new_url)
            new_html = etree.HTML(first_response.text)
            first_href = new_html.xpath('//*[@id="wiki_list_box"]/div[2]/ul/li/dl/dd/a/@href')
            # Disease category name, e.g. 儿科
            jibing_name = new_html.xpath(
                '//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li[' + str(url + 1) + ']/a/text()')[0]
            # Follow every disease entry in this category
            for second in first_href:
                list_data = []
                time.sleep(1)
                try:
                    second_url = 'http://disease.medlive.cn' + second
                    second_response = requests.get(second_url)
                    second_html = etree.HTML(second_response.text)
                    second_name = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/label/text()')[0]
                    second_href = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a/@href')[0]
                    # Open the "highlights" (精要) page
                    three_response = requests.get(second_href)
                    three_html = etree.HTML(three_response.text)
                    three_url = three_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[1]/div[2]/div[1]/dl/dd[3]/a/@href')[0]
                    four_url = 'http://disease.medlive.cn' + three_url
                    four_response = requests.get(four_url)
                    four_html = etree.HTML(four_response.text)
                    time.sleep(0.2)

                    # First "details" link: diagnostic factors
                    one_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[1]/p/a/@href')[0]
                    one_detail_url = 'http://disease.medlive.cn' + one_detail
                    one_detail_response = requests.get(one_detail_url)
                    one_detail_html = etree.HTML(one_detail_response.text)
                    keyword = ['关键因素']
                    keyword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div/h5/span[1]/text()'))
                    key_data = ['关键内容']
                    key_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/div/p/text()'))
                    otherword = ['其它诊断因素']
                    otherword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div/h5/span[1]/text()'))
                    other_data = ['其他诊断内容']
                    other_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/div/p/text()'))
                    dengerword = ['危险因素']
                    dengerword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div/h5/span[1]/text()'))
                    dengerdata = ['危险内容']
                    dengerdata.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/div/p/text()'))
                    list_data.append(keyword)
                    list_data.append(key_data)
                    list_data.append(dengerword)
                    list_data.append(dengerdata)
                    list_data.append(otherword)
                    list_data.append(other_data)
                    # time.sleep(random.randint(2, 8))  # optional extra jitter

                    # Second "details" link: tests
                    two_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[2]/p/a/@href')[0]
                    two_detail_url = 'http://disease.medlive.cn' + two_detail
                    two_detail_response = requests.get(two_detail_url)
                    two_detail_html = etree.HTML(two_detail_response.text)
                    precedence = ['优先检测']
                    precedence.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[1]/h5/span/text()'))
                    precedencedata = ['优先检测内容']
                    precedencedata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/table/tbody//text()'))
                    select = ['可选检测']
                    select.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[1]/h5/text()'))
                    selectdata = ['可选检测内容']
                    selectdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/table/tbody//text()'))
                    new = ['新的检测']
                    new.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[1]/h5/span/text()'))
                    newdata = ['新的检测内容']
                    newdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/table/tbody/tr//text()'))
                    list_data.append(precedence)
                    list_data.append(precedencedata)
                    list_data.append(select)
                    list_data.append(selectdata)
                    list_data.append(new)
                    list_data.append(newdata)
                    # time.sleep(random.randint(1, 5))  # optional extra jitter

                    # Third "details" link: treatment
                    three_detail = four_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[3]/p/a/@href')[0]
                    three_detail_url = 'http://disease.medlive.cn' + three_detail
                    three_detail_response = requests.get(three_detail_url)
                    three_detail_html = etree.HTML(three_detail_response.text)
                    Treatment_conditions = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a//text()')
                    Treatment_conditions_url = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a/@href')
                    # Keep only the first non-whitespace token of each label
                    p = r'\S+'
                    three_re = ['治疗细则']
                    t_re = []
                    for i in Treatment_conditions:
                        t_re.append(re.findall(p, i)[0])
                    three_re.append(t_re)
                    three_data = ['治疗细则内容']
                    for Treatment_url in Treatment_conditions_url:
                        new_Treatment_url = 'http://disease.medlive.cn' + Treatment_url
                        new_Treatment_urlresponse = requests.get(new_Treatment_url, headers=self.header)
                        Treatment_urlresponse = etree.HTML(new_Treatment_urlresponse.text)
                        three_data.append(Treatment_urlresponse.xpath(
                            '//*[@id="wiki_view_frm"]/div/div[1]/div[2]//text()'))
                    list_data.append(three_re)
                    list_data.append(three_data)
                    f.append({second_name: list_data})
                except Exception:
                    # Skip entries whose page layout does not match the XPaths
                    continue
            # One dict per category: {category name: [per-disease dicts]}
            fina_data.append({jibing_name: f})
        # Optionally dump the raw data for debugging:
        # with open(pathtxt, 'w+') as fp:
        #     fp.write(str(fina_data))
        print(fina_data)
        return fina_data

    def first_parser_data(self, data, path):
        # Flatten the scraped structure into four columns:
        # category | disease | factor label | value
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row = 2
        for i in data:  # {category: [...]}, e.g. {'儿科': [...]}
            for jibingkey, jibingvalue in i.items():
                print(0, jibingkey)
                worksheet.write(row, 0, jibingkey)
                for bingzhong in jibingvalue:  # {disease: [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        print(1, bingzhongkey)
                        worksheet.write(row, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # detail_data is [label, [values...]]; pad empty
                            # value lists so the label row is still written
                            if len(detail_data[1]) == 0:
                                detail_data[1].append('0')
                            for write_num in detail_data[1]:
                                for write_data in write_num:
                                    print(2, detail_data[0])
                                    worksheet.write(row, 2, detail_data[0])
                                    print(3, write_data)
                                    worksheet.write(row, 3, write_data)
                                    row += 1
        # Note: xlwt writes the legacy .xls format regardless of the
        # extension in `path`
        workbook.save(path)

    def parser_data(self, data):
        # Alternative layout: one row per factor group, one column per value
        # (unused; its call at the bottom of the file is commented out)
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row1 = 0
        row3 = 0
        row = 0
        a = 0
        for i in data:  # {category: [...]}
            for jibingkey, jibingvalue in i.items():
                worksheet.write(row3, 0, jibingkey)
                for bingzhong in jibingvalue:  # {disease: [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        worksheet.write(row1, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # One factor group, e.g. ['关键因素', ['肾脏病变', '听力障碍']]
                            worksheet.write(a, 2, detail_data[0])
                            a += 1
                            col = 3
                            for write_num in detail_data[1]:
                                worksheet.write(row, col, write_num)
                                col += 1
                            row += 1
                    row3 += 1
                row1 += 7
            row3 += row1
        workbook.save('医脉通—数据11-18.xlsx')

    def main(self):
        # Scrape five category ranges; each batch is written to its own
        # workbook from a separate process
        process_list = []
        for start, end, label in [(0, 5, '1-5'), (5, 10, '5-10'),
                                  (10, 15, '10-15'), (15, 20, '15-20'),
                                  (20, 25, '20-25')]:
            pathtxt = 'yimaitong %s.txt' % label
            data = self.parser_url(start, end, pathtxt)
            # xlwt will still write legacy .xls content under these names
            path = 'yimaitong %s.xlsx' % label
            p = Process(target=self.first_parser_data, args=(data, path))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()


# Guard the entry point so multiprocessing can spawn worker processes safely
if __name__ == '__main__':
    yimaitong = Yimaitong()
    # yimaitong.main()
    pathtxt1 = '内容.txt'
    data = yimaitong.parser_url(0, 1, pathtxt1)
    # yimaitong.parser_data(data)
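
For reference, parser_url returns a nested list-of-dicts structure. Here is a sketch of its shape, reconstructed from the code above; the concrete category, disease, and factor values are illustrative examples taken from the code comments:

# Shape of the value returned by parser_url (illustrative values only):
fina_data = [
    {  # one dict per disease category
        '儿科': [
            {  # one dict per disease
                'Alport综合征': [
                    ['关键因素', ['肾脏病变', '听力障碍']],  # [label, [values]]
                    ['关键内容', ['...']],
                    # ... followed by the 危险/其它诊断 factors, the 检测
                    # tests, and the 治疗细则 treatment entries, same form
                ],
            },
        ],
    },
]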
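The scraper issues a long chain of bare requests.get calls, so one dropped connection aborts an entire disease entry (and the except clause then silently skips it). A minimal hardening sketch using the standard requests/urllib3 retry machinery; make_session and its parameter values are assumptions, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Hypothetical helper: one shared session that retries transient failures
    retry = Retry(
        total=3,                                   # up to 3 retries per request
        backoff_factor=0.5,                        # sleep 0.5s, 1s, 2s between tries
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

# session.get() is then a drop-in replacement for the requests.get() calls:
# session = make_session()
# response = session.get('http://disease.medlive.cn/wiki/list/171', timeout=10)

A shared session also reuses the underlying TCP connection across the many per-disease requests, which is gentler on the site than opening a fresh connection each time.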
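One caveat on the output files: xlwt only produces the legacy binary .xls format, so the '.xlsx' names used above yield files that Excel flags as mismatched. If genuine .xlsx output is wanted, openpyxl is the usual replacement; a minimal sketch follows, where write_xlsx and its pre-flattened rows argument are illustrative assumptions, not part of the original script:

from openpyxl import Workbook

def write_xlsx(rows, path):
    # rows: list of (category, disease, label, value) tuples, a hypothetical
    # pre-flattened form of the scraped data
    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = 'Sheet1'
    # openpyxl rows/columns are 1-indexed, unlike xlwt's 0-indexed write()
    for r, (category, disease, label, value) in enumerate(rows, start=1):
        worksheet.cell(row=r, column=1, value=category)
        worksheet.cell(row=r, column=2, value=disease)
        worksheet.cell(row=r, column=3, value=label)
        worksheet.cell(row=r, column=4, value=value)
    workbook.save(path)

# write_xlsx([('儿科', 'Alport综合征', '关键因素', '肾脏病变')], 'yimaitong 1-5.xlsx')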
