Python PDF转Word,以及提取Word中图片里的文字

PDF转Word,以及提取Word中图片里的文字

PS：我的 Python 也是从网上各个帖子中学习的，因此代码的格式和内容借鉴了网上其他大神的代码，如有侵权请告知删除。

软件截图：

#!/usr/bin/python
# -*- coding: gbk -*-import six
from pdf2docx import Converter
import zipfile  # 压缩包
import os  # 文件库
import pytesseract
from tkinter import *
import tkinter.filedialog
from PIL import Image
import shutil

# Main application window; every widget of the tool is attached to it.
root = Tk()
root.title('PDF转换WORD,TXT')
root.geometry('580x300')

# Module-level state shared by the button callbacks and the OCR helpers.
PDF_FilePath = ''  # Path of the PDF file
DOCX_FilePath = ''  # Path of the Word file
TEXT_FilePth = ''  # Path of the text file
# Temporary folder that receives the images extracted from the Word file so
# that the text inside them can be recognised afterwards.
# NOTE(review): Button3Anwser deletes this whole folder when it finishes, so it
# should be a folder that is used by this tool only.
Image_BaseForder = 'C:\\Temp'

# Labels already placed in the window, keyed by (row, column). Reusing them
# keeps a repeated click from stacking a new label on top of the old one.
_GRID_LABELS = {}


def _ShowLabel(row, column, text, sticky=''):
    """Display *text* in grid cell (row, column), reusing that cell's label."""
    label = _GRID_LABELS.get((row, column))
    if label is None:
        label = Label(root, text='')
        label.grid(row=row, column=column, sticky=sticky)
        _GRID_LABELS[(row, column)] = label
    label.config(text=text)
    # The conversions run inside the button callback and block the event
    # loop, so redraw now or the status text would only appear afterwards.
    root.update_idletasks()


def Button1Anwser():
    """Ask the user for the PDF to convert and display the chosen path."""
    global PDF_FilePath
    PDF_FilePath = tkinter.filedialog.askopenfilename()
    _ShowLabel(2, 0, PDF_FilePath, sticky=W)


def Button4Anwser():
    """Ask the user for the Word file to read and display the chosen path."""
    global DOCX_FilePath
    DOCX_FilePath = tkinter.filedialog.askopenfilename()
    _ShowLabel(6, 0, DOCX_FilePath, sticky=W)


def Button2Anwser():
    """Convert the selected PDF into a .docx file stored next to it.

    The resulting path is kept in ``DOCX_FilePath`` so the text extraction
    step can pick it up without a second file dialog.
    """
    global DOCX_FilePath
    if not PDF_FilePath:
        _ShowLabel(3, 1, '请先选择PDF文件')
        # Nothing to convert: stop here instead of running on an empty path.
        return

    # splitext() only strips the final extension, so dots elsewhere in the
    # path (for example in a directory name) no longer truncate the result.
    DOCX_FilePath = os.path.splitext(PDF_FilePath)[0] + '.docx'
    _ShowLabel(3, 1, '开始转换')

    cv = Converter(PDF_FilePath)
    try:
        cv.convert(DOCX_FilePath, start=0, end=None)
    finally:
        # Release the PDF even when the conversion fails.
        cv.close()

    _ShowLabel(3, 1, '转换完成')
    _ShowLabel(4, 0, DOCX_FilePath, sticky=W)


def Button3Anwser():
    """Write the text recognised in the Word file's images to a .txt file.

    Works on ``DOCX_FilePath``, which is set either by a previous PDF
    conversion or by picking a Word file directly.
    """
    global TEXT_FilePth
    if not DOCX_FilePath:
        # The input of this step is the Word file, not the PDF.
        _ShowLabel(7, 1, '请先选择Word文件')
        return

    TEXT_FilePth = os.path.splitext(DOCX_FilePath)[0] + '.txt'
    _ShowLabel(7, 1, '开始转换')
    ToText(DOCX_FilePath)
    _ShowLabel(7, 1, '转换完成')
    # Read the global again: ToText() may have set the output path itself.
    _ShowLabel(8, 0, TEXT_FilePth, sticky=W)

    # Best-effort cleanup; the folder does not exist when nothing was
    # extracted, which must not turn a finished run into a crash.
    shutil.rmtree(Image_BaseForder, ignore_errors=True)
class Get_WordContent:
    """Extract the images embedded in a Word (.docx) file and OCR them.

    A .docx file is a zip archive, so the images are obtained by unpacking
    its image members into a per-document folder below ``Image_BaseForder``.
    """

    # Archive members with one of these extensions are treated as images.
    _IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')

    def __init__(self, Word_Path):
        """Remember the path of the Word file to process."""
        self.Word_Path = Word_Path

    def _Get_ImageForder(self):
        """Return the folder that holds this document's extracted images.

        Both Get_Image and Get_Words derive the folder from the Word file
        itself, so they always agree on where the images are.
        """
        Doc_Name = os.path.splitext(os.path.basename(self.Word_Path))[0]
        return os.path.join(Image_BaseForder, Doc_Name)

    def Get_Image(self):
        """Unpack the images of the Word file into its image folder.

        Also sets the global ``TEXT_FilePth`` to the Word file's path with
        a ``.txt`` extension. Errors are printed, not raised.
        """
        global TEXT_FilePth
        Word_Path = self.Word_Path
        try:
            TEXT_FilePth = os.path.splitext(Word_Path)[0] + '.txt'
            # The folder is named after the document to keep documents apart.
            Image_Forder = self._Get_ImageForder()
            with zipfile.ZipFile(Word_Path) as Doc:
                os.makedirs(Image_Forder, exist_ok=True)
                for Info in Doc.infolist():
                    if Info.filename.lower().endswith(self._IMAGE_SUFFIXES):
                        Doc.extract(Info, Image_Forder)
        except Exception as e:
            # Best effort, as before: report the problem and carry on.
            print(e)

    def Get_Words(self):
        """OCR every extracted image and write the text to ``TEXT_FilePth``.

        The output file is created even when the document has no images.
        Errors are printed, not raised.
        """
        try:
            # Word stores embedded pictures under word/media in the archive.
            Image_Forder = os.path.join(
                self._Get_ImageForder(), "word", "media")
            with open(TEXT_FilePth, 'w') as fd:
                if os.path.isdir(Image_Forder):
                    # Sorted so that repeated runs give the same output order.
                    for filename in sorted(os.listdir(Image_Forder)):
                        Image_Path = os.path.join(Image_Forder, filename)
                        with Image.open(Image_Path) as Picture:
                            # "chi_sim" is Tesseract's Simplified Chinese model.
                            words = str(pytesseract.image_to_string(
                                Picture, lang="chi_sim"))
                        fd.write(words)
        except Exception as e:
            # Best effort, as before: report the problem and carry on.
            print(e)


def ToText(WordPath):
    """Extract the images of the Word file at *WordPath* and OCR them."""
    WordContent = Get_WordContent(WordPath)
    WordContent.Get_Image()
    WordContent.Get_Words()
# ------------------------------------- UI ---------------------------------
# Four action buttons stacked in column 0. The rows between them are left
# free for the path labels that the button callbacks place there.
SelectFile = Button(root, text="待转换PDF", command=Button1Anwser,
                    width=15, height=1, bg="DarkGray", fg="white")
SelectFile.grid(row=1, column=0, sticky=W)

SelectFile = Button(root, text="转Word", command=Button2Anwser,
                    width=15, height=1, bg="DarkGray", fg="white")
SelectFile.grid(row=3, sticky=W)

SelectFile = Button(root, text="待提取文字Word", command=Button4Anwser,
                    width=15, height=1, bg="DarkGray", fg="white")
SelectFile.grid(row=5, column=0, sticky=W)

SelectFile = Button(root, text="转Text", command=Button3Anwser,
                    width=15, height=1, bg="DarkGray", fg="white")
SelectFile.grid(row=7, sticky=W)

# Usage hints displayed below the buttons.
Advice1 = Label(root, text='')
Advice1.config(text='1 : PDF转WORD后，发现Word中的文字是以图片显示的情况下，进一步进行Word中针对图片的文字识别')
Advice1.grid(row=9, column=0, sticky=W)

Advice2 = Label(root, text='')
Advice2.config(text='2 : 文字识别比不上网上需花钱的识别，会出现漏字，错字等情况，请提取后仔细检查')
Advice2.grid(row=10, column=0, sticky=W)

Advice3 = Label(root, text='')
Advice3.config(text='3 : 转换过程可能很长，转换按钮一直处于按下状态，请等待“转换完成”字样提示')
Advice3.grid(row=11, column=0, sticky=W)

# Hand control to Tk; this call returns only when the window is closed.
root.mainloop()

Python PDF转Word,以及提取Word中图片里的文字相关推荐

python 替换array中的值_利用Python提取视频中的字幕（文字识别）
我的CSDN博客id:qq_39783601,昵称是糖潮丽子~辣丽从今天开始我会陆续将数据分析师相关的知识点分享在这里,包括Python.机器学习.数据库等等. 今天来分享一个Python小项目! ...
access数据放到list中_利用Python提取视频中的字幕（文字识别）
我的CSDN博客id:qq_39783601,昵称是糖潮丽子~辣丽从今天开始我会陆续将数据分析师相关的知识点分享在这里,包括Python.机器学习.数据库等等. 今天来分享一个Python小项目! ...
python获取视频帧的时间_Python提取视频中图片的示例（按帧、按秒）
一.按帧提取 #coding=utf-8 import os import cv2 def save_img(): #提取视频中图片按照每帧提取 video_path = r'D:\\test\\' ...
python做视频抽帧图_Python提取视频中图片的示例（按帧、按秒）
一.按帧提取 #coding=utf-8 import os import cv2 def save_img(): #提取视频中图片按照每帧提取 video_path = r'd:\\test\\' ...
Word图文混排中图片的高级处理技巧
Word图文混排中图片的高级处理技巧 1.插入联机图片 2.图片的处理方式 1.插入联机图片 2.图片的处理方式
初次爬虫：读取PDF转成图片，再提取图片里的文字信息
读取PDF转成图片,再提取图片里的文字信息三步走第一步读取PDF并转换成图片第二步调用百度API来识别图片里面的文字信息第三步提取自己想要的文字信息三步走 1 读取PDF,将PDF转换 ...
java中怎样导入图片6_Spire.Doc系列教程（6）：插入图片到 Word 以及提取 Word 中的图片...
图片是Word文档的基本要素之一,常见的对Word图片的操作有插入.删除.替换和提取.本文将介绍如何使通过编程的方式添加图片到指定位置,以及如何获取Word文档中的图片并保存到本地路径. 在指定位置插 ...
python 输出纯音频_提取视频中的音频python三行程序搞定
写在开头身处数据爆炸增长的时代,各种各样的数据都飞速增长,视频数据也不例外.我们可以使用 python 来提取视频中的音频,而这仅仅需要安装一个体量很小的python包,然后执行三行程序! 语音数据 ...
解决WPS或Word中“图片隐藏在文字下面”的问题
问题描述:WPS或Word在插入图片时,图片格式是嵌入型,但是图片隐藏在文字下面. 解决办法:看文字行距设置是否为固定值,改为其他行距即可.

Python PDF转Word,以及提取Word中图片里的文字

PDF转Word,以及提取Word中图片里的文字

Python PDF转Word,以及提取Word中图片里的文字相关推荐

最新文章

热门文章