python 提取pdf格式电子发票并改名

本人是一家小公司的企业主，经常接收电子发票，有空时再转发给会计。电子发票的默认文件名基本都是发票号，看不出是哪家公司开的、哪家公司收的。
经过多次研究使用pymupdf读取pdf格式，但是不同省份和城市开出的电子发票细节格式并不相同，最终是按区域先读取再细分信息。
程序已能读取发票上的大部分信息，改名规则可以根据喜好修改。如果出现重名的发票，会在重复文件的文件名前加上下划线 _，便于后续删除。
目前广东省、厦门市、浙江省开出的发票测试无误；本程序有一定的容错机制，其他省份的发票大概率也没问题，除非格式特殊。
ps：不支持图片及完全曲线化的pdf电子发票（无字体pdf）
在 Python 3.7 及 PyMuPDF（fitz）1.18.14 下运行。

# _*_ coding:utf-8 _*_
# fitz pymupdf ver =1.18.14
# Python 3.7
# 2021-11-17
# by gingeer @ CSDN
# 按位置获取发票信息并改名
# pymupdf doc see https://pymupdf.readthedocs.io/en/latest/
## make a new pdf with same size of old one.
##doc = fitz.open("some.file")
##page = doc[0]
##paths = page.get_drawings()  # extract existing drawings
### this is a list of "paths", which can directly be drawn again using Shape
### -------------------------------------------------------------------------
###
### define some output page with the same dimensions
##outpdf = fitz.open()
##outpage = outpdf.new_page(width=page.rect.width, height=page.rect.height)
##shape = outpage.new_shape()  # make a drawing canvas for the output page
import sys
import fitz
import re
import os, math
import datetime
# pip install pyMuPDF not fitz!!!!!!!!!!!!!
def make_text(words):
    """Group PyMuPDF "words" items into text lines in reading order.

    Args:
        words: Items shaped like the output of ``page.get_text("words")``.
            Only index 0 (left x), index 3 (bottom y) and index 4 (the word
            text) are read.  The sequence is not modified.

    Returns:
        A list of strings, one per text line, ordered top to bottom.  Words
        within a line are ordered left to right and joined with one space.
        Join the result with a newline to get a single text block.
    """
    line_dict = {}  # key: rounded bottom coordinate, value: words on that line
    # sorted() instead of list.sort(): the caller's list keeps its order.
    for w in sorted(words, key=lambda w: w[0]):
        # Round the bottom edge so tiny differences in glyph height do not
        # split one visual line into several.
        y1 = round(w[3], 1)
        line_dict.setdefault(y1, []).append(w[4])
    return [" ".join(line_dict[y]) for y in sorted(line_dict)]
# Batch-rename the PDF e-invoices found in ``pdfPath``.
#
# The first page of each PDF is read with PyMuPDF, the text inside four fixed
# page regions is parsed, and the file is renamed to
#     <date>_<invoice number>_<buyer>(购)_<seller>_￥<total>.pdf
# Files whose required fields cannot all be read are reported and left alone.
#
# NOTE: relies on ``make_text`` defined earlier in this file.
# NOTE: the debug overlay of the original (drawing the regions on a page
# shape) was never committed to the document, so it is omitted here.

pdfPath = './'
pagesfiles = []  # not used in this section
newnames = []  # file names already assigned during this run
files = os.listdir(pdfPath)
pdffiles = [f for f in files if f.lower().endswith('.pdf')]

# Page regions as fractions of the page extent: [x0, y0, x1, y1].
invoice_rect_per = [0.69, 0.02, 0.978, 0.2]
buyer_rect_per = [0.16, 0.214, 0.57, 0.369]
seller_rect_per = [0.16, 0.744, 0.57, 0.884]
total_amounts_per = [0.666, 0.692, 0.95, 0.749]

HEADER_DATE_RE = re.compile(r'(\d{4})\D+(\d{2})\D+(\d{2})')
# Characters that cannot appear in a file name on Windows ('/' also on POSIX).
FORBIDDEN_CHARS_RE = re.compile(r'[\\/:*?"<>|]')
REQUIRED_FIELDS = ('date', 'number', 'buyer', 'seller', 'total')


def parse_invoice_header(lines):
    """Pick catalog line, invoice number and date out of the header lines.

    Each line is classified by the first rule that matches: 12 consecutive
    digits (presumably the invoice code line), 8 consecutive digits (the
    invoice number line), or a ``YYYY?MM?DD`` date.  When several lines match
    the same rule, the last one wins.

    Returns:
        ``(catalog, number, date)``; an element is ``None`` when no line
        matched.  ``number`` has the '发票号码' label and colons removed,
        ``date`` is formatted ``YYYY-MM-DD``.
    """
    catalog = number = date = None
    for line in lines:
        if re.search(r'\d{12}', line) is not None:
            catalog = line
        elif re.search(r'\d{8}', line) is not None:
            number = line
        else:
            match = HEADER_DATE_RE.search(line)
            if match is not None:
                date = '-'.join(match.groups())
    if number is not None:
        for label in ('发票号码', '：', ':'):
            number = number.replace(label, '')
        number = number.strip()
    return catalog, number, date


def pick_buyer(lines):
    """Return the first line containing '公司', else '个人' if present, else None."""
    buyer = None
    for line in lines:
        if '公司' in line:
            return line
        if line == '个人':
            buyer = '个人'  # keep looking: a later company line takes priority
    return buyer


def pick_seller(lines):
    """Return the first line longer than three characters, or None."""
    for line in lines:
        if len(line) > 3:
            return line
    return None


def pick_total(lines):
    """Return the last line without yen signs, or None when there are no lines."""
    if not lines:
        return None
    return lines[-1].replace('¥', '').replace('￥', '')


def sanitize_filename(name):
    """Replace characters that are not allowed in file names with '-'."""
    return FORBIDDEN_CHARS_RE.sub('-', name)


def build_invoice_name(date, number, buyer, seller, total):
    """Assemble the normalized PDF file name from the extracted fields."""
    elements = [date, number, buyer + '(购)', seller, '￥' + total]
    return sanitize_filename('_'.join(elements) + '.pdf')


def unique_target(name, taken, directory='', exists=os.path.exists):
    """Prefix ``name`` with '_' until it collides with nothing.

    Args:
        name: Desired file name.
        taken: Names already assigned during this run.
        directory: Directory in which the file will be created.
        exists: Predicate telling whether a path already exists; replaceable
            so the function can be exercised without touching the disk.

    Returns:
        ``name`` itself when it is free, otherwise the shortest
        underscore-prefixed variant that is neither in ``taken`` nor on disk.
    """
    candidate = name
    while candidate in taken or exists(os.path.join(directory, candidate)):
        candidate = '_' + candidate
    return candidate


def region_lines(page, words, percents):
    """Return the text lines made of the words lying fully inside a region.

    ``percents`` is ``[x0, y0, x1, y1]`` as fractions of ``page.rect[2]`` and
    ``page.rect[3]``.
    """
    x_end = page.rect[2]
    y_end = page.rect[3]
    region = fitz.Rect(percents[0] * x_end, percents[1] * y_end,
                       percents[2] * x_end, percents[3] * y_end)
    return make_text([w for w in words if fitz.Rect(w[:4]) in region])


def read_invoice_fields(path):
    """Open the PDF at ``path`` and extract the invoice fields of page one.

    The document is always closed before returning, so the file can be
    renamed afterwards.

    Returns:
        A dict with keys 'header' (raw header lines), 'catalog', 'number',
        'date', 'buyer', 'seller' and 'total'; a field that could not be read
        is ``None``.
    """
    doc = fitz.open(path)
    try:
        page = doc[0]
        words = page.get_text("words")  # list of words on page
        header = region_lines(page, words, invoice_rect_per)
        buyer_lines = region_lines(page, words, buyer_rect_per)
        seller_lines = region_lines(page, words, seller_rect_per)
        total_lines = region_lines(page, words, total_amounts_per)
    finally:
        doc.close()
    catalog, number, date = parse_invoice_header(header)
    return {
        'header': header,
        'catalog': catalog,
        'number': number,
        'date': date,
        'buyer': pick_buyer(buyer_lines),
        'seller': pick_seller(seller_lines),
        'total': pick_total(total_lines),
    }


def rename_invoice(pdffile):
    """Rename one PDF in ``pdfPath`` after its invoice contents.

    Nothing is renamed when the header region matches none of the invoice
    patterns, when a required field is missing, or when the file already
    carries its normalized name.  A name collision is resolved by prefixing
    underscores; the name actually used is recorded in ``newnames``.
    """
    print(pdffile)
    info = read_invoice_fields(os.path.join(pdfPath, pdffile))

    if info['catalog'] is None and info['number'] is None and info['date'] is None:
        print('不是发票文件', info['header'])
        return

    # Never fall back to values left over from a previously processed file.
    missing = [key for key in REQUIRED_FIELDS if info[key] is None]
    if missing:
        print('发票信息不完整，已跳过：', missing)
        print('------------------------')
        return

    newname = build_invoice_name(info['date'], info['number'], info['buyer'],
                                 info['seller'], info['total'])
    if newname == pdffile:
        print('发票名已经正规化！')
        newnames.append(newname)
        print('------------------------')
        return

    target = unique_target(newname, newnames, pdfPath)
    print(f'原名：{pdffile}\n新名：{target}')
    os.rename(os.path.join(pdfPath, pdffile), os.path.join(pdfPath, target))
    newnames.append(target)
    if target != newname:
        print('重复文件名!已加前缀')
    print('------------------------')


def main():
    """Process every PDF that was present in ``pdfPath`` at start-up."""
    for pdffile in pdffiles:
        rename_invoice(pdffile)


if __name__ == '__main__':
    main()

python 提取pdf格式电子发票并改名相关推荐

python提取pdf发票信息_PDF电子发票内容提取
网页版程序使用地址:[在线使用](https://www.yooongchun.com/apps) 摘要本文介绍如何提取PDF版电子发票的内容. 1. 加载内容首先使用Python的pdfplum ...
python提取pdf发票信息_python读取pdf（发票）
想读取文件夹*.pdf格式的发票并写入到excel当中,当然也可以写入txt(注释代码有) 详见下面代码,代码开头有参考的几篇文章的地址一开始用的是pdfplumber,不好用,识别率不高,后来使用 ...
c++代码整洁之道pdf_别再问如何用python提取PDF内容了
作者:陈熹来源:早起Python 大家好,在之前的办公自动化系列文章中我们已经详细介绍了如何使用python批量处理PDF文件,包括合并.拆分.水印.加密等操作. 今天我们再次回到PDF,详细讲解如 ...
别再问如何用 Python 提取 PDF 内容了！
作者 | 陈熹头图 | CSDN 下载自东方 IC 来源 | 早起Python(ID:zaoqi-python) 导读大家好,在之前的办公自动化系列文章中我们已经详细介绍了如何使用Python批量 ...
利用Python提取PDF文件中的文本信息
如何利用Python提取PDF文件中的文本信息日常工作中我们经常会用到pdf格式的文件,大多数情况下是浏览或者编辑pdf信息,但有时候需要提取pdf中的文本,如果是单个文件的话还可以通过复制粘贴来直 ...
怎么在python提取别的数据了_别再问如何用python提取PDF内容了！
作者:陈熹来源:早起Python 大家好,在之前的办公自动化系列文章中我们已经详细介绍了如何使用python批量处理PDF文件,包括合并.拆分.水印.加密等操作. 今天我们再次回到PDF,详细讲解如 ...
用python提取PDF表格内容保存到excel
一提取pdf方法介绍任务是用python提取PDF里的表格文件到excel里面去.做为一个学了一个周python的人来说当然像尝试一下看能不能做到,事实证明是可以的只是可能代码有点烂...... ...
别再问如何用Python提取PDF内容了！
公众号后台回复"图书",了解更多号主新书内容作者:陈熹来源:早起Python 导读大家好,在之前的办公自动化系列文章中我们已经详细介绍了如何使用Python批量处理P ...
Python提取PDF中的文字和图片
一,使用Python提取PDF中的文字 # 只能处理包含文本的PDF文件 #coding=utf-8 import sys import importlib importlib.reload(sys) ...

python 提取pdf格式电子发票并改名

python 提取pdf格式电子发票并改名相关推荐

最新文章

热门文章