巨潮资讯网爬取年报（存在错误）

import requests
import string
import os.path
import pandas
import re
import time# 请求头放在函数外面共用
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}# 1.定义获取org_id的函数,code为证券代码
def get_org_id(code):org_id_url = 'http://www.cninfo.com.cn/new/information/topSearch/query'org_id_data = {'keyWord': code,'maxNum': '10',}org_id_response = requests.post(url=org_id_url, data=org_id_data, headers=headers).json()# 从response中获取org_idfor data in org_id_response:org_id = data['orgId']return org_id# org_id_response.close()return None# 2.定义get_plate_colum, 获取plate: 深圳sz  或者  上海sh  或者 北京bj;third,以及column: 深圳szse   或者  上海sse  或者 北京bj
def get_plate_colum_compy_short_listing(code):# 如何在网址里加变量：在url前写一个f，就可以在url中以{}的形式写入变量# global colum, plateglobal colum, plateplate_url = f'http://www.cninfo.com.cn/data20/companyOverview/getCompanyIntroduction?scode={code}'plate_response = requests.get(url=plate_url, headers=headers).json()#json的解析提取，用在线网站转换格式，有字典就用序号，没字典就用键名plate_market = plate_response['data']['records'][0]['basicInformation'][0]['MARKET']# print(plate_market)compy_short  = plate_response['data']['records'][0]['basicInformation'][0]['ASECNAME']listing_date  = plate_response['data']['records'][0]['basicInformation'][0]['F006D']# address = plate_response['data']['records'][0]['basicInformation'][0]['F004V']    #获取地址分文件夹# # print(type(address))# province = address[0:6]# print(province)if plate_market == '上交所':plate = 'sh'colum = 'sse'else:if str(plate_market)[0:3] == '深交所':plate = 'sz'colum = 'szse'else:if plate_market == '北交所':plate = 'bj;third'colum = 'bj'else:print(str(code)+'的所属市场非上非深非北，是：'+str(plate_market))# print(plate, colum)# plate_response.close()return plate, colum, compy_short, listing_date# 3.定义get_pdf_url_dict项目字典的函数,
def get_pdf_url_dict(code, org_id, plate, colum):pdf_url_dict = {}page = 1while True:fild_list_data = {'stock': '{},{}'.format(code, org_id),  # code.org_id'tabName': 'fulltext','pageSize': '30','pageNum': page,  # 需要得到该公告共有几页'column': colum,  #'category': 'category_ndbg_szsh;','plate': plate,  #'seDate': '','searchkey': '','secid': '','sortName': '','sortType': '','isHLtitle': 'true',}fild_list_url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'fild_list_response = requests.post(url=fild_list_url, data=fild_list_data, headers=headers).json() fild_list_content = fild_list_response.get("announcements")  if fild_list_content is not None:for i in range(len(fild_list_content)):content_1 = fild_list_content[i]item_title = content_1.get("announcementTitle")item_type = content_1.get("adjunctType")# print(item_title,item_type)item_url = content_1.get("adjunctUrl")item_compy = content_1.get("secName")item_code = content_1.get("secCode")if item_type != 'PDF':continueif item_title.find('摘要') != -1:continueif item_title.find('已取消') != -1:continueif item_title.find('补充公告') != -1:continueif item_title.find('英文版') != -1:continuepdf_url = 'http://static.cninfo.com.cn/' + str(item_url)pdf_title = re.findall(r'\d+.*', item_title)[0]pdf_url_dict[pdf_title + str(item_code) + str(item_compy)] = [pdf_url]# pdf_name.append(str(pdf_title))page += 1else:break# fild_list_response.close()return pdf_url_dict# 4.创建文件夹分类下载pdf_url_dict中的连接
def download_pdf(pdf_url_dict, compy_short, province):for title in pdf_url_dict:# print(title)url = pdf_url_dict[title][0]## print(url)# 获取各个pdf链接的内容response = requests.get(url=url, headers=headers).content# 新建文件夹分类#   四川和重庆分开放if not os.path.exists('.file/{}'.format(province)):os.path.exists('.file/{}'.format(province))#   建立每个公司的文件夹if not os.path.exists('./file/{}/{}{}'.format(province, code, compy_short)):os.makedirs('./file/{}/{}{}'.format(province, code, compy_short))#   将pdf的内容写入with open('./file/{}/{}{}/{}.pdf'.format(province, code, compy_short, title), 'wb')as f:f.write(response)print('已下载——地区：' + str(province) + '  证券代码：' + str(code) + '  证券简称：' + str(compy_short) + '  PDF名称：' + str(title))# 5.定义run运行函数
def run(code, province):org_id = get_org_id(code)print(org_id)time.sleep(2)plate = get_plate_colum_compy_short_listing(code)[0]colum = get_plate_colum_compy_short_listing(code)[1]compy_short = get_plate_colum_compy_short_listing(code)[2]listing_date = get_plate_colum_compy_short_listing(code)[3]# print(plate, colum, compy_short)time.sleep(2)#因为有些刚上市的公司没有年报，需要跳过screen_data = {'stock': '{},{}'.format(code, org_id),  # code.org_id'tabName': 'fulltext','pageSize': '30','pageNum': '1','column': colum,'category': 'category_ndbg_szsh;','plate': plate,'seDate': '','searchkey': '','secid': '','sortName': '','sortType': '','isHLtitle': 'true',}screen_url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'screen_response = requests.post(url=screen_url, data=screen_data, headers=headers).json()  # screen_response就变成了字典screen_content = screen_response.get("announcements")  # screen_content是个列表if screen_content is None:print(str(compy_short) + str(code) + str(listing_date) + '上市无年报')else:time.sleep(1)pdf_url_dict = get_pdf_url_dict(code, org_id, plate, colum)download_pdf(pdf_url_dict, compy_short, province)# screen_response.close()df = pandas.read_excel('./川渝上市企业证券代码.xls', converters={'code': str, 'province': str})
for i in range(df.shape[0]):    #循环df的行数code = df['code'][i]province = df['province'][i]print(code, province)run(code, province)

报错：(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None)

603100 重庆市
9900022948
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2021年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2020年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2019年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2018年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2017年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2016年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2015年年度报告603100川仪股份
已下载——地区：重庆市  证券代码：603100  证券简称：川仪股份PDF名称：2014年年度报告603100川仪股份
600917 重庆市
9900029304
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2021年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2020年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2019年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2018年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2017年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2016年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2015年年度报告600917重庆燃气
已下载——地区：重庆市  证券代码：600917  证券简称：重庆燃气PDF名称：2014年年度报告600917重庆燃气
301121 重庆市
9900046816
紫建电子3011212022-08-08上市无年报
003006 重庆市
9900032173
已下载——地区：重庆市  证券代码：003006  证券简称：百亚股份PDF名称：2021年年度报告003006百亚股份
已下载——地区：重庆市  证券代码：003006  证券简称：百亚股份PDF名称：2020年年度报告003006百亚股份
603697 重庆市
gfbj0831377
已下载——地区：重庆市  证券代码：603697  证券简称：有友食品PDF名称：2021年年度报告603697有友食品
已下载——地区：重庆市  证券代码：603697  证券简称：有友食品PDF名称：2020年年度报告603697有友食品
已下载——地区：重庆市  证券代码：603697  证券简称：有友食品PDF名称：2019年年度报告603697有友食品
000514 重庆市
gssz0000514
Traceback (most recent call last):File "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopenhttplib_response = self._make_request(File "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 449, in _make_requestsix.raise_from(e, None)File "<string>", line 3, in raise_fromFile "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 444, in _make_requesthttplib_response = conn.getresponse()File "D:\Python3.10.6\lib\http\client.py", line 1374, in getresponseresponse.begin()File "D:\Python3.10.6\lib\http\client.py", line 318, in beginversion, status, reason = self._read_status()File "D:\Python3.10.6\lib\http\client.py", line 279, in _read_statusline = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")File "D:\Python3.10.6\lib\socket.py", line 705, in readintoreturn self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] 远程主机强迫关闭了一个现有的连接。During handling of the above exception, another exception occurred:Traceback (most recent call last):File "E:\Study\Python\venv\lib\site-packages\requests\adapters.py", line 489, in sendresp = conn.urlopen(File "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 787, in urlopenretries = retries.increment(File "E:\Study\Python\venv\lib\site-packages\urllib3\util\retry.py", line 550, in incrementraise six.reraise(type(error), error, _stacktrace)File "E:\Study\Python\venv\lib\site-packages\urllib3\packages\six.py", line 769, in reraiseraise value.with_traceback(tb)File "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopenhttplib_response = self._make_request(File "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 449, in _make_requestsix.raise_from(e, None)File "<string>", line 3, in raise_fromFile "E:\Study\Python\venv\lib\site-packages\urllib3\connectionpool.py", line 444, in _make_requesthttplib_response = conn.getresponse()File "D:\Python3.10.6\lib\http\client.py", line 1374, in getresponseresponse.begin()File "D:\Python3.10.6\lib\http\client.py", line 318, in beginversion, status, reason = self._read_status()File "D:\Python3.10.6\lib\http\client.py", line 279, in _read_statusline = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")File "D:\Python3.10.6\lib\socket.py", line 705, in readintoreturn self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))During handling of the above exception, another exception occurred:Traceback (most recent call last):File "E:\Study\Python\python学习\上市公司年报下载\自己再写写试试.py", line 262, in <module>run(code, province)File "E:\Study\Python\python学习\上市公司年报下载\自己再写写试试.py", line 223, in runplate = get_plate_colum_compy_short_listing(code)[0]File "E:\Study\Python\python学习\上市公司年报下载\自己再写写试试.py", line 91, in get_plate_colum_compy_short_listingplate_response = requests.get(url=plate_url, headers=headers).json()File "E:\Study\Python\venv\lib\site-packages\requests\api.py", line 73, in getreturn request("get", url, params=params, **kwargs)File "E:\Study\Python\venv\lib\site-packages\requests\api.py", line 59, in requestreturn session.request(method=method, url=url, **kwargs)File "E:\Study\Python\venv\lib\site-packages\requests\sessions.py", line 587, in requestresp = self.send(prep, **send_kwargs)File "E:\Study\Python\venv\lib\site-packages\requests\sessions.py", line 701, in sendr = adapter.send(request, **kwargs)File "E:\Study\Python\venv\lib\site-packages\requests\adapters.py", line 547, in sendraise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))进程已结束,退出代码1

巨潮资讯网爬取年报（存在错误）相关推荐

python + selenium实现巨潮资讯网指定范围年报下载
大家好!第一次写文章,紧张滴捏! 这段时间在做课设,课设里需要下载沪市600000到601000号的年报原文做数字化关键词的词频分析,想着用程序帮我批量下载一下,但是找了一下貌似没有类似的代码,就写了 ...
爬虫|巨潮资讯网上市公司年报爬取
爬虫|巨潮资讯网上市公司年报爬取 import pandas as pd from selenium import webdriver from selenium.webdriver.common.k ...
python3爬取巨潮资讯网的年报数据
python3爬取巨潮资讯网的年报数据前期准备: 需要用到的库: 完整代码: 前期准备: 巨潮资讯网有反爬虫机制,所以先打开巨潮资讯网的年报板块,看看有什么解决办法. 巨潮咨询年报板块可以通过这样 ...
基于python+selenium+Chrome自动化爬取巨潮资讯网A股财务报表
转自同学的博客引言: 网页爬虫分为静态网页爬虫和动态网页爬虫,前者是指索要获取的网页内容不需要经过js运算或者人工交互, 后者是指获取的内容必须要经过js运算或者人工交互.这里的js运算可能是aja ...
批量爬取巨潮资讯网中“贵州茅台”相关公告的PDF文件。
1 需求批量爬取巨潮资讯网中"贵州茅台"相关公告的PDF文件. 2 代码实现 import reimport requests from selenium import webd ...
请用Python语言写一个巨潮资讯网批量下载PDF的程序
下面是一个使用 Python 的简单程序,可以批量下载巨潮资讯网上的 PDF 文件: import requests import os# 巨潮资讯网 PDF 文件的 URL 前缀 url_prefi ...
爬取年报(巨潮资讯网)
https://blog.csdn.net/herr_kun/article/details/89707078#commentBox
2023-Python实现巨潮资讯网数据采集
目录
ChatGPT炒股：从巨潮资讯网上批量下载特定主题的股票公告
巨潮资讯网是股票公告的指定披露渠道之一,上面有非常详细的A股股票公告内容. 现在,我们要获取2023-01-04~2023-07-04期间所有新三板公司中标题包含"2023年日常性关联交易& ...

巨潮资讯网爬取年报（存在错误）

巨潮资讯网爬取年报（存在错误）相关推荐

最新文章

热门文章