Python爬虫-爬取--华夏基金代码

import re
import requests#创建一个华夏基金类：
class Fund():
    """Scrape the fund tables from the ChinaAMC (华夏基金) product page.

    Instantiating the class immediately performs the scrape: ``__init__``
    calls ``run``, which downloads the page and prints one dict per fund
    for each of the four tables (ids ``tb``, ``tb1``, ``tb2``, ``tb3``).
    Every ``get_data_*`` method also returns the dicts it printed, so the
    data can be consumed programmatically.
    """

    BASE_URL = "http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp"
    # Seconds to wait for the server; without a timeout requests can block forever.
    REQUEST_TIMEOUT = 10

    # Patterns shared by all four tables.
    _ROW_PATTERN = '<tr align="center"(.*?)</tr>'
    _CELL_PATTERN = '<td height="30">(.*?)</td>'
    _HEADER_PATTERN = '<span class="p16_libe">(.*?)</span>'

    # Each table is located by its exact opening tag on the page.
    _TABLE1_PATTERN = (
        '<table width="100%" border="0" cellspacing="0" cellpadding="0" '
        'style="margin-bottom:10px;border-bottom:1px solid #eee;" id="tb">'
        '(.*?)</table>'
    )
    _TABLE2_PATTERN = (
        '<table width="100%" border="0" cellspacing="0" cellpadding="0" '
        'style="margin-bottom:10px; border-bottom:1px solid #eee;'
        'border-top:1px solid #eee;" id="tb1">'
        '(.*?)</table>'
    )
    _TABLE3_PATTERN = (
        '<table width="100%" border="0" cellspacing="0" cellpadding="0" '
        'style="margin-bottom:10px;border-bottom:1px solid #eee; '
        'border-top:1px solid #eee;" id="tb2">'
        '(.*?)</table>'
    )
    _TABLE4_PATTERN = (
        '<table width="100%" border="0" cellspacing="0" cellpadding="0" '
        'style="border-top:1px solid #eee;border-bottom:1px solid #eee;'
        'margin-bottom:10px" id="tb3">'
        '(.*?)</table>'
    )

    def __init__(self):
        self.run()

    def run(self):
        """Download the product page and parse all four fund tables.

        Returns:
            A 4-tuple with the list of fund dicts of each table, in page order.
        """
        response = requests.get(self.BASE_URL, timeout=self.REQUEST_TIMEOUT)
        html = response.text
        return (
            self.get_data_tb(html),
            self.get_data_tb1(html),
            self.get_data_tb2(html),
            self.get_data_tb3(html),
        )

    @staticmethod
    def _find_table(html, table_pattern):
        """Return the inner HTML of the first matching table, or None if absent."""
        tables = re.findall(table_pattern, html, re.S)
        return tables[0] if tables else None

    def _rows(self, table):
        """Return the raw HTML of every centered ``<tr>`` inside ``table``."""
        return re.findall(self._ROW_PATTERN, table, re.S)

    def get_data_tb(self, html):
        """Parse and print the first table (id ``tb``).

        The dict keys are the column captions scraped from the table itself
        (fund short name, fund code, NAV date, NAV, cumulative NAV, change,
        inception date, subscription / redemption / regular-investment status).

        Args:
            html: Full page source.

        Returns:
            A list with one dict per fund row; empty when the table or its
            captions cannot be found.
        """
        print('---------------------打印第一张表-------------------------')
        table = self._find_table(html, self._TABLE1_PATTERN)
        if table is None:
            return []
        headers = re.findall(self._HEADER_PATTERN, table, re.S)
        if len(headers) < 10:
            return []

        funds = []
        for row in self._rows(table):
            title = re.findall('title="(.*?)" target=', row)
            cells = re.findall(self._CELL_PATTERN, row)
            # Rows without a title (e.g. the caption row) or with too few
            # cells cannot be mapped onto the ten columns, so skip them.
            if not title or len(cells) < 10:
                continue
            if cells[5] == '---':
                # '---' is the page's placeholder for "no change value".
                cells[5] = ''
            fund = dict(zip(headers, [title[0]] + cells[1:10]))
            print(fund)
            funds.append(fund)
        return funds

    def get_data_tb1(self, html):
        """Parse and print the second table (id ``tb1``).

        Args:
            html: Full page source.

        Returns:
            A list with one dict per fund row; empty when the table is absent.
        """
        print('---------------------打印第二张表-------------------------')
        table = self._find_table(html, self._TABLE2_PATTERN)
        if table is None:
            return []

        keys = (
            '基金简称', '基金代码', '净值日期', '百万盘收益', '七日年收益',
            '最近30天的年化', '获取今年以来的年化', '成立日期',
            '申购状态', '赎回状态', '定投状态',
        )
        funds = []
        for row in self._rows(table):
            title = re.findall('title="(.*?)"', row)
            # Cells of this table may span several lines, hence re.S.
            cells = re.findall(self._CELL_PATTERN, row, re.S)
            if not title or len(cells) < 12:
                continue
            fund = dict(zip(keys, [title[0]] + cells[2:12]))
            print(fund)
            funds.append(fund)
        return funds

    def get_data_tb2(self, html):
        """Parse and print the third table (id ``tb2``).

        Args:
            html: Full page source.

        Returns:
            A list with one dict per fund row; empty when the table is absent.
        """
        print('---------------------打印第三张表-------------------------')
        table = self._find_table(html, self._TABLE3_PATTERN)
        if table is None:
            return []

        funds = []
        for row in self._rows(table):
            name = re.findall('title="(.*?)"', row)
            cells = re.findall(self._CELL_PATTERN, row)
            if not name or len(cells) < 10:
                continue
            fund = {
                '基金简称': name[0],
                '基金代码': cells[2],
                '净值日期': cells[3],
                '万盘收益': cells[4],
                '七日年收益': cells[5],
                # This column is always emitted empty.
                '运作期年化收益': '',
                '成立日期': cells[6],
                '申购状态': cells[7],
                '赎回状态': cells[8],
                '定投状态': cells[9],
            }
            print(fund)
            funds.append(fund)
        return funds

    def get_data_tb3(self, html):
        """Parse and print the fourth table (id ``tb3``).

        The dict keys are the column captions scraped from the table itself;
        the caption list is printed before the rows.

        Args:
            html: Full page source.

        Returns:
            A list with one dict per fund row; empty when the table or its
            captions cannot be found.
        """
        print('---------------------打印第四张表-------------------------')
        table = self._find_table(html, self._TABLE4_PATTERN)
        if table is None:
            return []
        headers = re.findall(self._HEADER_PATTERN, table, re.S)
        print(headers)
        if len(headers) < 8:
            return []

        funds = []
        for row in self._rows(table):
            name = re.findall('title="(.*?)"', row)
            cells = re.findall(self._CELL_PATTERN, row)
            if not name or len(cells) < 9:
                continue
            # Normalise the page's "no value" placeholders to empty strings.
            for index in (4, 5):
                if cells[index] == '--':
                    cells[index] = ''
            if cells[8] == '---':
                cells[8] = ''
            fund = dict(zip(headers, [name[0]] + cells[2:9]))
            print(fund)
            funds.append(fund)
        return funds
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    Fund()

Python爬虫-爬取--华夏基金代码相关推荐

python网络爬虫_Python网络爬虫——爬取视频网站源视频！
原标题:Python网络爬虫--爬取视频网站源视频! 学习前提 1.了解python基础语法 2.了解re、selenium、BeautifulSoup、os、requests等python第三方库 ...
python java 爬数据_如何用java爬虫爬取网页上的数据
当我们使用浏览器处理网页的时候,有时候是不需要浏览的,例如使用PhantomJS适用于无头浏览器,进行爬取网页数据操作.最近在进行java爬虫学习的小伙伴们有没有想过如何爬取js生成的网络页面吗?别急 ...
java用爬虫爬一个页面_使用Java写一个简单爬虫爬取单页面
使用Java爬虫爬取人民日报公众号页面图片使用Java框架Jsoup和HttpClient实现,先看代码爬取目标页面 1.使用Maven构建一个普通Java工程加入依赖: org.jsoup j ...
MATLAB爬虫爬取股票数据
近年来,大数据盛行,有关爬虫的教程层出不穷.那么,爬虫到底是什么呢? 什么是爬虫? 百度百科是这样定义的: 网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种 ...
在当当买了python怎么下载源代码-python爬虫爬取当当网
[实例简介]python爬虫爬取当当网 [实例截图] [核心代码] ''' Function: 当当网图书爬虫 Author: Charles 微信公众号: Charles的皮卡丘 ''' impor ...
python如何爬虫网页数据-python网络爬虫爬取网页内容
1.什么是网络爬虫? 网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.另外一些不常使用的名字还有蚂蚁.自 ...
python爬虫代码实例-Python爬虫爬取百度搜索内容代码实例
这篇文章主要介绍了Python爬虫爬取百度搜索内容代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下搜索引擎用的很频繁,现在利用Python爬 ...
python爬虫数据分析可以做什么-python爬虫爬取的数据可以做什么
在Python中连接到多播服务器问题,怎么解决你把redirect关闭就可以了.在send时,加上参数allow_redirects=False 通常每个浏览器都会设置redirect的次数.如果re ...
爬虫python的爬取步骤-Python爬虫爬取数据的步骤
爬虫: 网络爬虫是捜索引擎抓取系统(Baidu.Google等)的重要组成部分.主要目的是将互联网上的网页下载到本地,形成一个互联网内容的镜像备份. 步骤: 第一步:获取网页链接 1.观察需要爬取的多 ...

Python爬虫-爬取--华夏基金代码

Python爬虫-爬取--华夏基金代码相关推荐

最新文章

热门文章