PDFMiner —— Python 的 PDF 解析器和分析器

PDFParser:

从一个文件中获取数据

PDFDocument:

保存获取的数据,和PDFParser是相互关联的

PDFPageInterpreter:处理页面内容

PDFDevice: 将 PDFPageInterpreter 处理后的页面内容转换成你需要的格式

PDFResourceManager:

用于存储共享资源,如字体或图像。

#!/usr/bin/python

import sys

import os

from binascii import b2a_hex

###

### pdf-miner requirements

###

from pdfminer.pdfparser import PDFParser

from pdfminer.pdfdocument import PDFDocument,

PDFNoOutlines

from pdfminer.pdfpage import PDFPage

from pdfminer.pdfinterp import PDFResourceManager,

PDFPageInterpreter

from pdfminer.converter import PDFPageAggregator

from pdfminer.layout import LAParams, LTTextBox, LTTextLine,

LTFigure, LTImage, LTChar

def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, apply ``fn`` to it, and return the result.

    Args:
        pdf_doc: Path of the PDF file to open (opened in binary mode).
        fn: Callable invoked as ``fn(doc, *args)`` with the PDFDocument.
        pdf_pwd: Password passed to the PDFDocument constructor.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or None when opening/reading the file
        raises IOError or the document is not flagged as extractable.
    """
    result = None
    try:
        # The with-statement guarantees the file is closed even when the
        # parser, the document constructor, or fn raises.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document
            # structure; the password is supplied here
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
            if doc.is_extractable:
                # apply the function and keep its result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or a similar problem occurred;
        # deliberately best-effort: the caller gets None
        pass
    return result

###

### Table of Contents

###

def _parse_toc(doc):
    """Collect the table of contents (toc) from an open PDFDocument.

    This is a higher-order function meant to be passed to with_pdf().

    Returns:
        A list of ``(level, title)`` tuples, one per outline entry; the list
        is empty when the document raises PDFNoOutlines.
    """
    toc_entries = []
    try:
        for outline in doc.get_outlines():
            # each outline entry is a 5-tuple; only the first two are kept
            level, title, _dest, _action, _se = outline
            toc_entries.append((level, title))
    except PDFNoOutlines:
        # no outlines available: fall through with what was collected
        pass
    return toc_entries

def get_toc(pdf_doc, pdf_pwd=''):
    """Return the table of contents (toc), if any, for this pdf file.

    Args:
        pdf_doc: Path of the PDF file.
        pdf_pwd: Password for the document (empty string by default).

    Returns:
        The value produced by running _parse_toc through with_pdf().
    """
    toc = with_pdf(pdf_doc, _parse_toc, pdf_pwd)
    return toc

###

### Extracting Images

###

def write_file(folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination.

    Args:
        folder: Existing directory in which to create the file.
        filename: Name of the file inside ``folder``.
        filedata: Data handed to ``write()``.
        flags: Mode passed to ``open()``: 'w' for write text, 'wb' for
            write binary; use 'a' instead of 'w' for append.

    Returns:
        True when the data was written; False when ``folder`` is not a
        directory or an IOError occurred while opening or writing.
    """
    if not os.path.isdir(folder):
        return False
    try:
        # the with-statement closes the handle even if write() fails
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError:
        return False
    return True

def determine_image_type(stream_first_4_bytes):
    """Find out the image file type from the magic number in the header.

    Compares the first 4 (or 2) bytes against known magic numbers.

    Args:
        stream_first_4_bytes: Leading bytes of the image stream; only the
            first four are inspected.

    Returns:
        '.jpeg', '.png', '.gif' or '.bmp' when the header matches, else None.
    """
    # b2a_hex returns bytes on Python 3; decode so the comparisons below work
    # against text literals on both Python 2 and Python 3.
    bytes_as_hex = b2a_hex(stream_first_4_bytes[:4]).decode('ascii')
    magic_numbers = (
        ('ffd8', '.jpeg'),
        ('89504e47', '.png'),
        ('47494638', '.gif'),
        ('424d', '.bmp'),
    )
    for magic, file_type in magic_numbers:
        if bytes_as_hex.startswith(magic):
            return file_type
    return None

def save_image(lt_image, page_number, images_folder):
    """Try to save the image data from this LTImage object.

    Args:
        lt_image: Object exposing ``stream`` (with ``get_rawdata()``) and
            ``name``.
        page_number: Page number used as the file name prefix.
        images_folder: Directory the image file is written to.

    Returns:
        The file name ``<page_number>_<name><ext>`` when the image was
        written, otherwise None (no stream, no raw data, unrecognised image
        type, or the write failed).
    """
    if not lt_image.stream:
        return None
    raw_data = lt_image.stream.get_rawdata()
    if not raw_data:
        return None
    extension = determine_image_type(raw_data[0:4])
    if not extension:
        return None
    target_name = ''.join([str(page_number), '_', lt_image.name, extension])
    if write_file(images_folder, target_name, raw_data, flags='wb'):
        return target_name
    return None

###

### Extracting Text

###

def to_bytestring(s, enc='utf-8'):
    """Convert the given unicode string to a bytestring.

    Uses the given encoding unless the value is already a ``str`` or
    ``bytes`` object, in which case it is returned unchanged. On Python 2
    this encodes ``unicode`` values; on Python 3 ``str`` values pass through
    untouched, as the ``isinstance(s, str)`` check always did.

    Args:
        s: The value to convert.
        enc: Encoding passed to ``s.encode()``.

    Returns:
        ``s`` itself for str/bytes input, an empty string for any other
        falsy input (e.g. None), otherwise ``s.encode(enc)``. An empty
        string is returned instead of None so results can always be joined.
    """
    if isinstance(s, (str, bytes)):
        return s
    if not s:
        return ''
    return s.encode(enc)

def update_page_text_hash(h, lt_obj, pct=0.2):
    """Group the text of ``lt_obj`` with earlier text of similar bbox width.

    The bbox x0 and x1 values of ``lt_obj`` are compared with each key of
    ``h``; when both lie within ``pct`` (a fraction, 0.2 == 20%) of a key's
    values, the text is appended to that key's list. Otherwise a new
    ``(x0, x1)`` entry is created.

    Args:
        h: Dict mapping ``(x0, x1)`` tuples to lists of text strings;
            updated in place.
        lt_obj: Object exposing ``bbox`` (x0 at index 0, x1 at index 2) and
            ``get_text()``.
        pct: Relative tolerance applied to each key value.

    Returns:
        The same dict ``h``.
    """
    x0 = lt_obj.bbox[0]
    x1 = lt_obj.bbox[2]
    text = to_bytestring(lt_obj.get_text())

    def within(value, reference):
        # same comparison as before: reference*(1-pct) <= value <= reference*(1+pct)
        return value >= (reference * (1.0 - pct)) and (reference * (1.0 + pct)) >= value

    for k, v in h.items():
        if within(x0, k[0]) and within(x1, k[1]):
            # the text inside this LT* object was positioned at the same
            # width as a prior series of text, so it belongs together
            v.append(text)
            # stop at the first match: without this the same text was
            # appended to every overlapping key and duplicated in the output
            break
    else:
        # the text, based on width, is a new series,
        # so it gets its own series (entry in the hash)
        h[(x0, x1)] = [text]
    return h

def parse_lt_objs(lt_objs, page_number, images_folder, text=None):
    """Iterate through the LT* objects and capture their text or image data.

    Args:
        lt_objs: Iterable of layout objects (LTTextBox, LTTextLine, LTImage,
            LTFigure; anything else is ignored).
        page_number: Page number, used for image file names and messages.
        images_folder: Directory that extracted images are written to.
        text: Unused; kept so existing callers (including the recursive
            call below) keep working. Defaults to None rather than a shared
            mutable list.

    Returns:
        A single string: image markers and nested figure content in
        encounter order, followed by the text columns sorted by their
        ``(x0, x1)`` key, all joined with newlines.
    """
    text_content = []
    # k=(x0, x1) of the bbox, v=list of text strings within that bbox width
    # (physical column)
    page_text = {}
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text, so arrange it logically based on its column width
            page_text = update_page_text_hash(page_text, lt_obj)
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, and note its
            # place in the text
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # use an html style img tag to mark the position of the
                # image within the text
                # NOTE(review): the tag markup was stripped from the
                # published listing; reconstructed from the comment and the
                # otherwise unused saved_file -- confirm the exact format.
                text_content.append(
                    '<img src="' + os.path.join(images_folder, saved_file) + '" />')
            else:
                # write() works on both Python 2 and 3; %r prints the actual
                # repr instead of the bound __repr__ method object
                sys.stderr.write(
                    "error saving image on page %s %r\n" % (page_number, lt_obj))
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children
            text_content.append(
                parse_lt_objs(lt_obj, page_number, images_folder, text_content))

    # sort the page_text hash by the keys (x0, x1 values of the bbox), which
    # orders the columns by their horizontal position
    for key in sorted(page_text):
        # skip None entries so a text object without text cannot break join
        text_content.append(''.join(t for t in page_text[key] if t))
    return '\n'.join(text_content)

###

### Processing Pages

###

def _parse_pages(doc, images_folder):
    """Parse every page of an open PDFDocument.

    This is a higher-order function meant to be passed to with_pdf().

    Args:
        doc: The open PDFDocument.
        images_folder: Directory handed to parse_lt_objs for extracted
            images.

    Returns:
        A list with one parse_lt_objs() result per page, in page order.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    pages_text = []
    # pages are numbered from 1
    for page_number, page in enumerate(PDFPage.create_pages(doc), 1):
        page_interpreter.process_page(page)
        # the aggregator yields the layout object for the page just
        # processed; it may contain child objects like LTTextBox, LTFigure,
        # LTImage, etc.
        layout = aggregator.get_result()
        pages_text.append(parse_lt_objs(layout, page_number, images_folder))
    return pages_text

def get_pages(pdf_doc, pdf_pwd='', images_folder='/tmp'):
    """Process each page of this pdf file and return the text per page.

    Args:
        pdf_doc: Path of the PDF file.
        pdf_pwd: Password for the document (empty string by default).
        images_folder: Directory forwarded to _parse_pages for images.

    Returns:
        The value produced by running _parse_pages through with_pdf(): a
        list of strings, one per page.
    """
    return with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)

# Append the text of every page of the PDF to a.txt.
# The with-statement closes a.txt even if extraction or a write fails.
with open('a.txt', 'a') as a:
    # get_pages() returns None when the PDF cannot be opened or is not
    # extractable; fall back to an empty list instead of raising TypeError.
    for i in get_pages('/home/test.pdf') or []:
        a.write(i)

python pdf转txt_Python之pdf转txt相关推荐

  1. python 保存pdf文件_PyPDF2读取PDF文件内容保存到本地TXT实例

    我就废话不多说了,大家还是直接看代码吧! from PyPDF2.pdf import PdfFileReader import pandas as pd def Pdf_to_txt(pdf): f ...

  2. Python打开文件并进行处理,txt、excel、pdf、word!

    1. 概要 在办公处理中,我们常常要打开一些文件,面临大量的数据时,传统的人工方法耗时耗力.在python中,有一系列包装好的库,让我们能够很方便的操作各种类型的文件.当然,python的内置函数也能 ...

  3. 【Python小技巧】将pdf转为txt,并使用edge-tts将txt批量转为MP3(不想看书想听书的转过来,送源代码)

    文章目录 前言 一.PDF转为MP3 ? 二.准备工作 1. 安装pdfplumber包,用于将pdf转为txt 2. 安装edge-tts包,用于将txt转为音频 三.代码很简单 四.变更播音员 总 ...

  4. 用python汇总pdf文件_Python处理PDF文件-简译与总结

    最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能 提取文件信息 旋转 ...

  5. 数据挖掘基础之数据清洗:用python把“深圳二手房参考价”PDF保存为EXCEL

    坑DIE的住建局再一次不限富豪限刚需,公布了深圳市住宅小区二手住房成交参考价格,买房更难,首付更难凑... 数据挖掘基础之数据清理:用python把深圳二手房参考价PDF保存为EXCEL,以便其他分析 ...

  6. python读取pdf文件_python读取pdf文件

    广告关闭 腾讯云11.11云上盛惠 ,精选热门产品助力上云,云服务器首年88元起,买的越多返的越多,最高返5000元! 一.安装pdfminer3k模块?二. 读取pdf文件import sysimp ...

  7. python不可以处理pdf文件_Python处理PDF文件-简译与总结

    最近看到一篇介绍Python中pyPDF模块的文章,详细介绍了使用pyPDF模块获取PDF文件信息,合并拆分PDF文件等功能.很方便,在此搬运分享以下: 全文介绍了以下几方面的功能 提取文件信息 旋转 ...

  8. 【Python】用python将html转化为pdf

    其实早在去年就有做过,一直没有写,先简单记录下 1.主要用到的工具[wkhtmltopdf] [下载地址]wkhtmltopdf 根据系统选择安装包,速度有点慢,先挂着 2.下载Python库 pip ...

  9. 《我的Python之路V1.3.pdf》可以下载了,这版pdf更精美!

    1 前言 Python之路V1.3.pdf,使用更加专业的latex脚本和专业的Tex Live编译器生成,在经过这周3,4天的学习,编写,制作,终于完成年前制定的计划. 在制作V1.3.pdf中,遇 ...

  10. python精彩编程200例pdf下载-最经典的25本Python编程开发电子书(附下载地址)!...

    如果,有人想转行程序猿,而让给出"如何切入编程"的建议的话,许多人一定毫不犹豫地向他推荐"Python"! 没错,就是这样一只Python,它让众多程序开发者, ...

最新文章

  1. [转]emacs中文输入问题
  2. Linux I/O 那些事儿
  3. 一文读懂 HTTP/1HTTP/2HTTP/3
  4. linux svn apt get,Ubuntu下安装SVN客户端
  5. QQ 鼻祖立功!世界最大僵尸网络 Andromeda 为祸七年终被捣毁
  6. 架构设计--逻辑层 vs 物理层
  7. textView 属性总结
  8. 工程施工工地进度监控带天气经纬度相机(监理日志不再难写)
  9. checkIP——烂代码又堆了一个ip活性检测工具
  10. pandas统计个数
  11. python为在线漫画网站自制非官方API(未完待续)
  12. “Failed to load response data“ django@xframe_options_exempt 网站不许 Firefox 显示被嵌入的网页
  13. Qt实现 基于ffmpeg拉流播放视频
  14. Windows Server 2008R2 取消屏幕自动锁定
  15. 制作往图片里插入视频,视频添加到图片上播放
  16. 什么是太阳光模拟器整车全光谱阳光模拟系统?
  17. 陈老师排课软件12A(正式版)
  18. 《C++ Templates》笔记 Chapter 12 Fundamentals in Depth-Chapter 13 Names in Templates
  19. Golang map源码浅析
  20. 面积与弦长_Simplelife_新浪博客

热门文章

  1. 尚硅谷Docker---docker安装及简介
  2. 设计一函数,求整数区间[a,b]和[c,d]的交集
  3. eclipse重定向输入输出到文件
  4. iOS之摇一摇功能实现
  5. 2011华为上机试题-Java
  6. java获取字符串实际编码_Java获取字符串编码方式
  7. 超详细 Spring @RequestMapping 注解使用技巧
  8. 设计模式 (九) 组合模式
  9. RAC3——RAC原理开始
  10. Installation openQRM