python 读取pdf 两栏_python 读取pdf

# #-*- coding: UTF-8 -*-

# -*- coding:utf-8 -*-

##提取pdf文件中的文字

import os.path
import re
import time

import requests

# The timer is started here, ahead of the pdfminer imports, matching the
# original statement order; the __main__ section reports the elapsed time.
time1 = time.time()

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
from pdfminer.pdfparser import PDFParser

class CPdf2TxtManager:
    """Extract the text of a PDF file into a sibling ``.txt`` file with pdfminer."""

    def changePdfToText(self, filePath):
        """Append every text element of the PDF at ``filePath`` to a ``.txt`` file.

        The output path is ``filePath`` with its extension replaced by
        ``.txt``. The output file is opened in append mode, so extracted text
        is added after any content that is already there. Each extracted
        chunk is also printed to stdout.

        Args:
            filePath: Path of the PDF file to read.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids text
                extraction.
        """
        txt_path = os.path.splitext(filePath)[0] + '.txt'

        # Binary read mode; the context manager closes the handle even when
        # parsing fails part-way through.
        with open(filePath, 'rb') as pdf_file:
            # The parser reads from the file object; the document object
            # stores the parsed structure. A non-empty password is only
            # needed for password-protected documents.
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser, password='')

            # Refuse documents that do not permit text extraction.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed(
                    'Text extraction is not allowed: {}'.format(filePath)
                )

            # Resource manager for shared resources; caching is disabled.
            rsrcmgr = PDFResourceManager(caching=False)
            # Layout parameters, page aggregator device and page interpreter.
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Open the output once for the whole document instead of once per
            # text element. UTF-8 is explicit so that characters outside the
            # platform default codec do not raise UnicodeEncodeError.
            with open(txt_path, 'a+', encoding='utf-8') as out:
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    # LTPage object holding the layout items parsed from
                    # this page (text boxes, figures, images, ...).
                    layout = device.get_result()
                    for x in layout:
                        # Only items exposing get_text() carry text. To keep
                        # horizontal text boxes only, test
                        # isinstance(x, LTTextBoxHorizontal) instead.
                        if hasattr(x, "get_text"):
                            results = x.get_text()
                            print(results)
                            out.write(results + '\n')

if __name__ == '__main__':
    # Script entry point: convert one hard-coded PDF and report the time
    # elapsed since the module-level timer `time1` was started.
    path = r'd:\tmp\c.pdf'

    pdf2TxtManager = CPdf2TxtManager()
    pdf2TxtManager.changePdfToText(path)

    time2 = time.time()

    print('ok,解析pdf结束!')
    print('总共耗时：'+ str(time2 - time1) + 's')

# 方法2

# # -*- coding: utf-8 -*-

# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

# from pdfminer.converter import TextConverter

# from pdfminer.layout import LAParams

# from pdfminer.pdfpage import PDFPage

# import requests,os,re

# try:#python3

# from io import StringIO

# from urllib.request import urlopen

# except:#python2

# from urllib import urlopen

# from cStringIO import StringIO

# def convert_pdf_to_txt(path,save_name):

# if debug:

# # 加载内存的方式

# retstr = StringIO()

# fp = StringIO(path)

# else:

# #读取文件的方式

# retstr = open(path, 'rb')

# fp = open(path, 'rb')

# # 创建一个PDF资源管理器对象来存储共享资源,caching = False不缓存

# rsrcmgr = PDFResourceManager(caching=False)

# # 创建一个PDF设备对象

# laparams = LAParams()

# device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)

# # 创建一个PDF解析器对象

# interpreter = PDFPageInterpreter(rsrcmgr, device)

# for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password='',caching=True, check_extractable=True):

# interpreter.process_page(page)

# fp.close()#关闭输入流

# device.close()#关闭输出流

# str = retstr.getvalue()

# retstr.close()

# try:

# with open("%s"%save_name,"w") as f:

# for i in str:

# f.write(i)

# print("%s Writing Succeed!"%save_name)

# except:

# print("Writing Failed!")

# if __name__ == '__main__':

# try:

# debug=True

# if debug:

# #这种方式暂时还有问题

# pdf_file = urlopen(url).read() # 也可以换成本地pdf文件，用open rb模式打开

# # pdf_file = requests.get(url).content

# # 加载内存的方式

# convert_pdf_to_txt(pdf_file, "123.txt")

# else:

# #读取文件的方式

# convert_pdf_to_txt('11.pdf',"123.txt")

# except Exception as e:

# import traceback

# ex_msg = '{exception}'.format(exception=traceback.format_exc())

# print(ex_msg)

python 读取pdf 两栏_python 读取pdf相关推荐

chatgpt赋能python：Python如何分成两栏写入Word文档
Python如何分成两栏写入Word文档在进行文本排版时,有些时候我们需要将文字分成两栏来排版,这样可以让文章更加美观,易读. 本文将介绍一种使用Python将文本分成两栏写入Word文档的方法.在 ...
python提取pdf发票信息_python读取pdf（发票）
想读取文件夹*.pdf格式的发票并写入到excel当中,当然也可以写入txt(注释代码有) 详见下面代码,代码开头有参考的几篇文章的地址一开始用的是pdfplumber,不好用,识别率不高,后来使用 ...
python获取pdf页数_Python读取pdf页面的一部分
假设您使用的是pdfminer和pypdf2,那么我假设所讨论的PDF文件是生成的PDF而不是扫描的(如您所给出的示例中所示).如果您知道以英寸为单位的列和行的大小,您可以使用^{}(完全公开:我写了 ...
python读取配置文件失败原因_python读取配置文件报keyerror-文件路径不正确导致的错误...
- 在其他模块使用反射读取配置文件报错,但是在反射模块中读取GetData.check_list又是正确的反射模块如下: # get_data.py from API_AUTO.p2p_projec ...
python数据处理pdf百度云_Python数据处理 PDF 高清版
给大家带来的一篇关于Python相关的电子书资源,介绍了关于Python.数据.处理方面的内容,本书是由人民邮电出版社出版,格式为PDF,资源大小12.0 MB,杰奎琳·凯泽尔编写,目前豆瓣.亚马逊. ...
python合并pdf 加书签_Python生成pdf目录书签的实例方法
有时候我们用的一些pdf资料是没有目录的,这样找寻我们想到的东西比较麻烦.本篇文章就为大家带来python来生成pdf目录书签的方法. 首先,我们需要下载一个软件FreePic2Pdf,利用它我们可以 ...
python读取ini文件编码格式_Python读取txt（.ini）文件BOM问题
2018-06-13 11:20:40 在windows上使用open打开utf-8编码的txt文件时开头会有一个多余的字符,它叫BOM,是用来声明编码等信息的,但python会把它当作文本解析 ...
python区域找图命令_python读取图片任意范围区域
使用python进行图片处理,现在需要读出图片的任意一块区域,并将其转化为一维数组,方便后续卷积操作的使用. 下面使用两种方法进行处理: convert 函数 from PIL import Imag ...
python从txt读取数据并画图_Python读取txt某几列绘图的方法
晚上帮同学用Python脚本绘图,大概需求是读取一个txt文件的两列分别作为x和y的值,绘图即可,代码如下: #coding:utf-8 import numpy as np import matpl ...

python 读取pdf 两栏_python 读取pdf

python 读取pdf 两栏_python 读取pdf相关推荐

最新文章

热门文章