python pdf转txt_Python之pdf转txt
PDFMiner——Python的PDF解析器和分析器
PDFParser:
从文件中读取并解析数据。
PDFDocument:
保存解析得到的数据,与PDFParser相互关联。
PDFPageInterpreter:
处理页面内容。
PDFDevice:
将页面内容转换成你需要的格式。
PDFResourceManager:
用于存储共享资源,如字体或图像。
#!/usr/bin/python
import sys
import os
from binascii import b2a_hex
###
### pdf-miner requirements
###
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument,
PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager,
PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine,
LTFigure, LTImage, LTChar
def with_pdf (pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, apply the function, and return the result.

    Args:
        pdf_doc: path of the PDF file to open.
        fn: callable invoked as ``fn(doc, *args)`` with the PDFDocument.
        pdf_pwd: password passed to PDFDocument.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or None when an IOError occurs (for
        example the file does not exist) or when the document is not
        flagged as extractable.
    """
    result = None
    try:
        # the context manager closes the file even when parsing or fn()
        # raises something other than IOError
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document
            # structure; the password is supplied here
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
            if doc.is_extractable:
                # apply the function and keep its result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or a similar problem occurred;
        # deliberately best-effort: the caller receives None
        pass
    return result
###
### Table of Contents
###
def _parse_toc (doc):
    """Collect the table of contents (toc) from an open PDFDocument.

    [this is a higher-order function to be passed to with_pdf()]

    Returns a list of (level, title) tuples taken from
    ``doc.get_outlines()``; the list is empty when PDFNoOutlines is raised.
    """
    entries = []
    try:
        # each outline record is a 5-tuple; only level and title are kept
        entries.extend(
            (level, title)
            for (level, title, _dest, _action, _se) in doc.get_outlines()
        )
    except PDFNoOutlines:
        pass
    return entries
def get_toc (pdf_doc, pdf_pwd=''):
    """Return the table of contents (toc), if any, for this pdf file.

    Delegates to with_pdf() with _parse_toc as the function to apply, so
    the result is whatever with_pdf() hands back for that combination.
    """
    toc_entries = with_pdf(pdf_doc, _parse_toc, pdf_pwd)
    return toc_entries
###
### Extracting Images
###
def write_file (folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination.

    Args:
        folder: existing directory to write into.
        filename: name of the file inside ``folder``.
        filedata: data passed to the file object's ``write``.
        flags: mode for ``open`` ('w' for write text, 'wb' for write
            binary, use 'a' instead of 'w' for append).

    Returns:
        True when the data was written; False when ``folder`` is not a
        directory or an IOError occurred while opening/writing.
    """
    if not os.path.isdir(folder):
        return False
    try:
        # 'with' guarantees the handle is closed even if write() fails
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError:
        return False
    return True
def determine_image_type (stream_first_4_bytes):
    """Find out the image file type from the magic number of the first bytes.

    Args:
        stream_first_4_bytes: the first 4 (or 2) bytes of the image stream.

    Returns:
        '.jpeg', '.png', '.gif' or '.bmp' when the magic number is
        recognised, otherwise None.
    """
    # b2a_hex returns bytes on Python 3; decode it so the comparisons
    # against the text literals below work on both Python 2 and 3
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')
    file_type = None
    if bytes_as_hex.startswith('ffd8'):
        file_type = '.jpeg'
    elif bytes_as_hex == '89504e47':
        file_type = '.png'
    elif bytes_as_hex == '47494638':
        file_type = '.gif'
    elif bytes_as_hex.startswith('424d'):
        file_type = '.bmp'
    return file_type
def save_image (lt_image, page_number, images_folder):
    """Try to save the image data held by this LTImage object.

    The file is named ``<page_number>_<lt_image.name><extension>`` and is
    written in binary mode into ``images_folder``.

    Returns the file name on success, otherwise None (no stream, empty
    raw data, unrecognised image type, or the write failed).
    """
    if not lt_image.stream:
        return None
    raw_bytes = lt_image.stream.get_rawdata()
    if not raw_bytes:
        return None
    extension = determine_image_type(raw_bytes[0:4])
    if not extension:
        return None
    target_name = str(page_number) + '_' + lt_image.name + extension
    if write_file(images_folder, target_name, raw_bytes, flags='wb'):
        return target_name
    return None
###
### Extracting Text
###
def to_bytestring (s, enc='utf-8'):
    """Convert the given unicode string to a bytestring, unless it already is one.

    Args:
        s: the value to convert.
        enc: encoding used when ``s`` has to be encoded.

    Returns:
        None when ``s`` is falsy (None, empty string, ...); ``s`` itself
        when it is a ``str`` or ``bytes`` instance; otherwise
        ``s.encode(enc)``.
    """
    if not s:
        return None
    # 'bytes' is an alias of 'str' on Python 2; on Python 3 it must be
    # accepted explicitly, because bytes objects have no encode() method
    if isinstance(s, (str, bytes)):
        return s
    return s.encode(enc)
def update_page_text_hash (h, lt_obj, pct=0.2):
    """Group the object's text with earlier text of similar bbox x0,x1 values.

    Args:
        h: dict mapping (x0, x1) tuples to lists of text strings.
        lt_obj: object providing ``bbox`` (x0 at index 0, x1 at index 2)
            and ``get_text()``.
        pct: relative tolerance applied to each key's x0 and x1.

    Returns:
        ``h`` itself, updated in place: the text is appended to the first
        key whose x0 and x1 are both within ``pct`` of the object's, or
        stored under a new (x0, x1) key when no key matches.
    """
    x0 = lt_obj.bbox[0]
    x1 = lt_obj.bbox[2]
    # to_bytestring() yields None for empty text; store '' instead so a
    # later ''.join() over the series cannot fail
    text = to_bytestring(lt_obj.get_text()) or ''

    def _within_tolerance(value, reference):
        # order the two bounds so the window is not empty when the
        # reference coordinate is negative
        bound_a = reference * (1.0 - pct)
        bound_b = reference * (1.0 + pct)
        return min(bound_a, bound_b) <= value <= max(bound_a, bound_b)

    for (hash_x0, hash_x1), series in h.items():
        if _within_tolerance(x0, hash_x0) and _within_tolerance(x1, hash_x1):
            # the text inside this LT* object was positioned at the same
            # width as a prior series of text, so it belongs together;
            # stop at the first match so the text is not duplicated
            # across several overlapping series
            series.append(text)
            break
    else:
        # the text, based on width, is a new series,
        # so it gets its own series (entry in the hash)
        h[(x0, x1)] = [text]
    return h
def parse_lt_objs (lt_objs, page_number, images_folder, text=None):
    """Iterate through the LT* objects and capture their text or image data.

    Args:
        lt_objs: iterable of layout objects.
        page_number: page number, used for image file names and messages.
        images_folder: folder in which images are saved.
        text: not used by this function; kept so existing callers that
            pass a fourth argument keep working.

    Returns:
        A single string: image markers and nested figure text in the
        order encountered, followed by the text series sorted by their
        (x0, x1) key, all joined with newlines.
    """
    text_content = []
    # k=(x0, x1) of the bbox, v=list of text strings within that bbox
    # width (physical column)
    page_text = {}
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text, so arrange it logically based on its column width
            page_text = update_page_text_hash(page_text, lt_obj)
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, and note its
            # place in the text
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # use html style <img /> tag to mark the position of the
                # image within the text
                # NOTE(review): the literal was lost in the pasted source
                # (only an empty, broken string remained); reconstructed
                # from the comment above -- confirm the exact tag format
                text_content.append(
                    '<img src="' + os.path.join(images_folder, saved_file) + '" />')
            else:
                # sys.stderr.write works on Python 2 and 3 alike, and %r
                # actually calls repr() on the object
                sys.stderr.write(
                    'error saving image on page %s %r\n' % (page_number, lt_obj))
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children
            text_content.append(
                parse_lt_objs(lt_obj, page_number, images_folder, text_content))
    # emit the text series ordered by their (x0, x1) keys
    for key in sorted(page_text):
        # skip empty/None pieces so the join cannot fail on them
        text_content.append(''.join(piece for piece in page_text[key] if piece))
    return '\n'.join(text_content)
###
### Processing Pages
###
def _parse_pages (doc, images_folder):
    """Parse every page of an open PDFDocument into text.

    [this is a higher-order function to be passed to with_pdf()]

    Returns a list with one parse_lt_objs() result per page, in page
    order; pages are numbered starting at 1.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    pages_text = []
    for page_number, page in enumerate(PDFPage.create_pages(doc), 1):
        page_interpreter.process_page(page)
        # the aggregator hands back the LTPage layout for the page just
        # processed; it may contain child objects like LTTextBox,
        # LTFigure, LTImage, etc.
        page_layout = aggregator.get_result()
        pages_text.append(parse_lt_objs(page_layout, page_number, images_folder))
    return pages_text
def get_pages (pdf_doc, pdf_pwd='', images_folder='/tmp'):
    """Process each page of this pdf file and return the per-page text.

    Delegates to with_pdf() with _parse_pages as the function to apply
    and ``images_folder`` as its extra argument, so the result is
    whatever with_pdf() hands back for that combination.
    """
    return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)
# Append the text of every page of the sample PDF to a.txt.
# 'with' closes the output file even if extraction or writing fails.
with open('a.txt', 'a') as out_file:
    # get_pages() returns None when the PDF cannot be read; fall back to
    # an empty list so the loop is skipped instead of raising TypeError
    for page_text in get_pages('/home/test.pdf') or []:
        out_file.write(page_text)
python pdf转txt_Python之pdf转txt相关推荐
- python 保存pdf文件_PyPDF2读取PDF文件内容保存到本地TXT实例
我就废话不多说了,大家还是直接看代码吧! from PyPDF2.pdf import PdfFileReader import pandas as pd def Pdf_to_txt(pdf): f ...
- Python打开文件并进行处理,txt、excel、pdf、word!
1. 概要 在办公处理中,我们常常要打开一些文件,面临大量的数据时,传统的人工方法耗时耗力.在python中,有一系列包装好的库,让我们能够很方便的操作各种类型的文件.当然,python的内置函数也能 ...
- 【Python小技巧】将pdf转为txt,并使用edge-tts将txt批量转为MP3(不想看书想听书的转过来,送源代码)
文章目录 前言 一.PDF转为MP3 ? 二.准备工作 1. 安装pdfplumber包,用于将pdf转为txt 2. 安装edge-tts包,用于将txt转为音频 三.代码很简单 四.变更播音员 总 ...
- 用python汇总pdf文件_Python处理PDF文件-简译与总结
最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能 提取文件信息 旋转 ...
- 数据挖掘基础之数据清洗:用python把“深圳二手房参考价”PDF保存为EXCEL
坑DIE的住建局再一次不限富豪限刚需,公布了深圳市住宅小区二手住房成交参考价格,买房更难,首付更难凑... 数据挖掘基础之数据清理:用python把深圳二手房参考价PDF保存为EXCEL,以便其他分析 ...
- python读取pdf文件_python读取pdf文件
广告关闭 腾讯云11.11云上盛惠 ,精选热门产品助力上云,云服务器首年88元起,买的越多返的越多,最高返5000元! 一.安装pdfminer3k模块?二. 读取pdf文件import sysimp ...
- python不可以处理pdf文件_Python处理PDF文件-简译与总结
最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能 提取文件信息 旋转 ...
- 【Python】用python将html转化为pdf
其实早在去年就有做过,一直没有写,先简单记录下 1.主要用到的工具[wkhtmltopdf] [下载地址]wkhtmltopdf 根据系统选择安装包,速度有点慢,先挂着 2.下载Python库 pip ...
- 《我的Python之路V1.3.pdf》可以下载了,这版pdf更精美!
1 前言 Python之路V1.3.pdf,使用更加专业的latex脚本和专业的Tex Live编译器生成,在经过这周3,4天的学习,编写,制作,终于完成年前制定的计划. 在制作V1.3.pdf中,遇 ...
- python精彩编程200例pdf下载-最经典的25本Python编程开发电子书(附下载地址)!...
如果,有人想转行程序猿,而让给出"如何切入编程"的建议的话,许多人一定毫不犹豫地向他推荐"Python"! 没错,就是这样一只Python,它让众多程序开发者, ...
最新文章
- [转]emacs中文输入问题
- Linux I/O 那些事儿
- 一文读懂 HTTP/1HTTP/2HTTP/3
- linux svn apt get,Ubuntu下安装SVN客户端
- QQ 鼻祖立功!世界最大僵尸网络 Andromeda 为祸七年终被捣毁
- 架构设计--逻辑层 vs 物理层
- textView 属性总结
- 工程施工工地进度监控带天气经纬度相机(监理日志不再难写)
- checkIP——烂代码又堆了一个ip活性检测工具
- pandas统计个数
- python为在线漫画网站自制非官方API(未完待续)
- “Failed to load response data“ django@xframe_options_exempt 网站不许 Firefox 显示被嵌入的网页
- Qt实现 基于ffmpeg拉流播放视频
- Windows Server 2008R2 取消屏幕自动锁定
- 制作往图片里插入视频,视频添加到图片上播放
- 什么是太阳光模拟器整车全光谱阳光模拟系统?
- 陈老师排课软件12A(正式版)
- 《C++ Templates》笔记 Chapter 12 Fundamentals in Depth-Chapter 13 Names in Templates
- Golang map源码浅析
- 面积与弦长_Simplelife_新浪博客