python pdf转txt_Python之pdf转txt

PDFMiner —— Python 的 PDF 解析器和分析器

PDFParser:

从一个文件中获取数据

PDFDocument:

保存解析得到的文档数据，与 PDFParser 相互关联。

PDFPageInterpreter:处理页面内容

PDFDevice: 将其翻译成你需要的格式

PDFResourceManager:

用于存储共享资源，如字体或图像。

#!/usr/bin/python

import os
import sys
from binascii import b2a_hex

###
### pdf-miner requirements
###

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LAParams,
    LTChar,
    LTFigure,
    LTImage,
    LTTextBox,
    LTTextLine,
)
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, apply ``fn`` to it, and return the result.

    Args:
        pdf_doc: path of the PDF file; it is opened in binary mode.
        fn: callable invoked as ``fn(doc, *args)`` with the PDFDocument.
        pdf_pwd: password passed to the PDFDocument constructor.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        The value returned by ``fn``, or None when an IOError occurs
        (e.g. the file does not exist) or ``doc.is_extractable`` is false.
    """
    result = None
    try:
        # the context manager closes the file even if parsing or fn raises
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
            if doc.is_extractable:
                # apply the function and keep its result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or a similar problem: deliberately
        # best-effort, the caller just gets None back
        pass
    return result

###

### Table of Contents

###

def _parse_toc(doc):
    """Collect the table of contents (toc) from an open PDFDocument.

    This is meant to be passed as ``fn`` to with_pdf().

    Args:
        doc: object whose ``get_outlines()`` yields 5-item entries
            ``(level, title, dest, a, se)``.

    Returns:
        A list of ``(level, title)`` tuples; empty when the document raises
        PDFNoOutlines.
    """
    toc = []
    try:
        # keep the explicit loop: entries gathered before a PDFNoOutlines
        # raised mid-iteration are still returned
        for (level, title, _dest, _a, _se) in doc.get_outlines():
            toc.append((level, title))
    except PDFNoOutlines:
        pass
    return toc

def get_toc(pdf_doc, pdf_pwd=''):
    """Return the table of contents (toc), if any, for this pdf file.

    Args:
        pdf_doc: path of the PDF file.
        pdf_pwd: password for the document (empty string by default).

    Returns:
        The result of running _parse_toc through with_pdf(): a list of
        ``(level, title)`` tuples, or None when with_pdf() yields no result.
    """
    return with_pdf(pdf_doc, _parse_toc, pdf_pwd)

###

### Extracting Images

###

def write_file(folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination.

    Args:
        folder: existing directory to write into.
        filename: name of the file inside ``folder``.
        filedata: data passed to ``write()``.
        flags: mode for ``open()``: 'w' for write text, 'wb' for write
            binary, use 'a' instead of 'w' for append.

    Returns:
        True when the data was written; False when ``folder`` is not a
        directory or an IOError occurred.
    """
    if not os.path.isdir(folder):
        return False
    try:
        # the context manager closes the handle even if write() fails
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError:
        return False
    return True

def determine_image_type(stream_first_4_bytes):
    """Find out the image file type from its magic number.

    The first 4 (or 2) bytes are compared against known signatures.

    Args:
        stream_first_4_bytes: the leading bytes of the image stream.

    Returns:
        '.jpeg', '.png', '.gif' or '.bmp', or None when no signature matches.
    """
    file_type = None
    # b2a_hex() returns bytes on Python 3; decode so the comparisons against
    # the text literals below behave the same on Python 2 and 3
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')
    if bytes_as_hex.startswith('ffd8'):
        file_type = '.jpeg'
    elif bytes_as_hex == '89504e47':
        file_type = '.png'
    elif bytes_as_hex == '47494638':
        file_type = '.gif'
    elif bytes_as_hex.startswith('424d'):
        file_type = '.bmp'
    return file_type

def save_image(lt_image, page_number, images_folder):
    """Try to save the image data from this LTImage object.

    Args:
        lt_image: object exposing ``stream`` (with ``get_rawdata()``) and
            ``name``.
        page_number: page number used as the file name prefix.
        images_folder: directory the image file is written to.

    Returns:
        The file name ``<page_number>_<name><ext>`` when the image was
        written, otherwise None (no stream, no raw data, unrecognised image
        type, or the write failed).
    """
    if not lt_image.stream:
        return None
    file_stream = lt_image.stream.get_rawdata()
    if not file_stream:
        return None
    # the extension is derived from the magic number in the first 4 bytes
    file_ext = determine_image_type(file_stream[0:4])
    if not file_ext:
        return None
    file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
    if write_file(images_folder, file_name, file_stream, flags='wb'):
        return file_name
    return None

###

### Extracting Text

###

def to_bytestring(s, enc='utf-8'):
    """Convert the given string to a bytestring unless it is already a ``str``.

    Args:
        s: the value to convert.
        enc: encoding passed to ``s.encode()``.

    Returns:
        ``s`` itself when it is a ``str``; ``s.encode(enc)`` for any other
        truthy value; None when ``s`` is falsy (empty or None).
    """
    if not s:
        return None
    if isinstance(s, str):
        # already a str: hand it back unchanged (the pasted code had a bare
        # ``return`` here, which turned every str into None)
        return s
    return s.encode(enc)

def update_page_text_hash(h, lt_obj, pct=0.2):
    """Group the text of ``lt_obj`` with earlier text of similar bbox width.

    Args:
        h: dict mapping ``(x0, x1)`` keys to lists of text strings; it is
            modified in place.
        lt_obj: object exposing ``bbox`` (index 0 is used as x0, index 2 as
            x1) and ``get_text()``.
        pct: relative tolerance applied to each key's x0 and x1 values.

    Returns:
        The same dict ``h``, with the text appended to every key whose x0 and
        x1 are both within ``pct`` of the object's, or stored under a new
        ``(x0, x1)`` key when no existing key matches.
    """
    x0 = lt_obj.bbox[0]
    x1 = lt_obj.bbox[2]
    text = to_bytestring(lt_obj.get_text())

    key_found = False
    for k, v in h.items():
        hash_x0 = k[0]
        hash_x1 = k[1]
        x0_close = hash_x0 * (1.0 - pct) <= x0 <= hash_x0 * (1.0 + pct)
        x1_close = hash_x1 * (1.0 - pct) <= x1 <= hash_x1 * (1.0 + pct)
        if x0_close and x1_close:
            # the text inside this LT* object was positioned at the same
            # width as a prior series of text, so it belongs together
            key_found = True
            v.append(text)
            # NOTE(review): there is no break, so text matching several
            # overlapping keys is appended to each of them - confirm whether
            # that duplication is intended

    if not key_found:
        # the text, based on width, is a new series,
        # so it gets its own series (entry in the hash)
        h[(x0, x1)] = [text]

    return h

def parse_lt_objs(lt_objs, page_number, images_folder, text=None):
    """Iterate through LT* objects and capture their text or image data.

    Args:
        lt_objs: iterable of layout objects (LTTextBox, LTTextLine, LTImage,
            LTFigure; other types are ignored).
        page_number: page number, used for image file names and error output.
        images_folder: directory extracted images are saved to.
        text: unused; kept only so existing callers (including the recursive
            call below) keep working. Defaults to None instead of a shared
            mutable list.

    Returns:
        A single string: image markers and nested figure content in
        encounter order, followed by the text series sorted by their
        ``(x0, x1)`` key, all joined with newlines.
    """
    text_content = []

    # k=(x0, x1) of the bbox, v=list of text strings within that bbox width
    # (physical column)
    page_text = {}
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text, so arrange it logically based on its column width
            page_text = update_page_text_hash(page_text, lt_obj)
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, and note its
            # place in the text
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # use an html style <img /> tag to mark the position of the
                # image within the text
                # NOTE(review): the literal was truncated in the pasted
                # source; this form is reconstructed - confirm
                text_content.append(
                    '<img src="' + os.path.join(images_folder, saved_file) + '" />')
            else:
                # write() works on Python 2 and 3, and %r actually renders
                # the object (the old code printed the unbound __repr__)
                sys.stderr.write(
                    "error saving image on page %s %r\n" % (page_number, lt_obj))
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children
            text_content.append(
                parse_lt_objs(lt_obj, page_number, images_folder, text_content))

    # sort the page_text hash by the keys (x0, x1 values of the bbox) so the
    # column series come out in a stable order
    for _key, series in sorted(page_text.items()):
        # skip None/empty entries so join() cannot fail on them
        text_content.append(''.join(piece for piece in series if piece))

    return '\n'.join(text_content)

###

### Processing Pages

###

def _parse_pages(doc, images_folder):
    """Parse every page of an open PDFDocument.

    This is meant to be passed as ``fn`` to with_pdf().

    Args:
        doc: the PDFDocument handed to ``PDFPage.create_pages()``.
        images_folder: directory passed on to parse_lt_objs() for images.

    Returns:
        A list with one parse_lt_objs() result per page, in page order.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    # page numbers passed downstream are 1-based
    for page_number, page in enumerate(PDFPage.create_pages(doc), 1):
        interpreter.process_page(page)
        # receive the LTPage object for this page
        layout = device.get_result()
        # layout is an LTPage object which may contain child objects like
        # LTTextBox, LTFigure, LTImage, etc.
        text_content.append(parse_lt_objs(layout, page_number, images_folder))

    return text_content

def get_pages(pdf_doc, pdf_pwd='', images_folder='/tmp'):
    """Process each page of this pdf file and return the text of each page.

    Args:
        pdf_doc: path of the PDF file.
        pdf_pwd: password for the document (empty string by default).
        images_folder: directory extracted images are written to.

    Returns:
        The result of running _parse_pages through with_pdf(): a list of
        strings, one per page, or None when with_pdf() yields no result.
    """
    return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)

# Demo driver: append the text of every page of the PDF to a.txt.
# The context manager closes the output file even if a write fails.
with open('a.txt', 'a') as a:
    # get_pages() returns None when the PDF cannot be opened or is not
    # extractable; fall back to an empty list so the loop is a no-op
    for i in get_pages('/home/test.pdf') or []:
        a.write(i)

python pdf转txt_Python之pdf转txt相关推荐

python 保存pdf文件_PyPDF2读取PDF文件内容保存到本地TXT实例
我就废话不多说了,大家还是直接看代码吧! from PyPDF2.pdf import PdfFileReader import pandas as pd def Pdf_to_txt(pdf): f ...
Python打开文件并进行处理，txt、excel、pdf、word！
1. 概要在办公处理中,我们常常要打开一些文件,面临大量的数据时,传统的人工方法耗时耗力.在python中,有一系列包装好的库,让我们能够很方便的操作各种类型的文件.当然,python的内置函数也能 ...
【Python小技巧】将pdf转为txt，并使用edge-tts将txt批量转为MP3（不想看书想听书的转过来，送源代码）
文章目录前言一.PDF转为MP3 ? 二.准备工作 1. 安装pdfplumber包,用于将pdf转为txt 2. 安装edge-tts包,用于将txt转为音频三.代码很简单四.变更播音员总 ...
用python汇总pdf文件_Python处理PDF文件-简译与总结
最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能提取文件信息旋转 ...
数据挖掘基础之数据清洗：用python把“深圳二手房参考价”PDF保存为EXCEL
坑DIE的住建局再一次不限富豪限刚需,公布了深圳市住宅小区二手住房成交参考价格,买房更难,首付更难凑... 数据挖掘基础之数据清理:用python把深圳二手房参考价PDF保存为EXCEL,以便其他分析 ...
python读取pdf文件_python读取pdf文件
广告关闭腾讯云11.11云上盛惠 ,精选热门产品助力上云,云服务器首年88元起,买的越多返的越多,最高返5000元! 一.安装pdfminer3k模块?二. 读取pdf文件import sysimp ...
python不可以处理pdf文件_Python处理PDF文件-简译与总结
最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能提取文件信息旋转 ...
【Python】用python将html转化为pdf
其实早在去年就有做过,一直没有写,先简单记录下 1.主要用到的工具[wkhtmltopdf] [下载地址]wkhtmltopdf 根据系统选择安装包,速度有点慢,先挂着 2.下载Python库 pip ...
《我的Python之路V1.3.pdf》可以下载了，这版pdf更精美！
1 前言 Python之路V1.3.pdf,使用更加专业的latex脚本和专业的Tex Live编译器生成,在经过这周3,4天的学习,编写,制作,终于完成年前制定的计划. 在制作V1.3.pdf中,遇 ...
python精彩编程200例pdf下载-最经典的25本Python编程开发电子书（附下载地址）！...
如果,有人想转行程序猿,而让给出"如何切入编程"的建议的话,许多人一定毫不犹豫地向他推荐"Python"! 没错,就是这样一只Python,它让众多程序开发者, ...

python pdf转txt_Python之pdf转txt

python pdf转txt_Python之pdf转txt相关推荐

最新文章

热门文章