PDF电子发票内容提取

可以点击这里使用发票提取软件：发票解析

请参考最新的实现方案： 浅谈电子发票识别方案

在线使用：发票提取

摘要

本文介绍如何提取PDF版电子发票的内容。

1. 加载内容

首先使用Python的pdfplumber库读入内容。

# Path of the sample invoice PDF used throughout this walkthrough.
FILE=r"data/test-2.pdf"
# NOTE(review): `pb` is presumably pdfplumber imported under that alias
# (e.g. `import pdfplumber as pb`); the import is not shown in this snippet.
pdf=pb.open(FILE)
# Only the first page of the document is processed.
page=pdf.pages[0]

接着读取内容并提取线段。

# Read the words and the drawn line segments from the page.
words = page.extract_words(x_tolerance=5)
lines = page.lines  # line segments (the outer border lines are not included)
for word in words:
    print(word)

# Coordinate conversion: give words and lines the same x0/y0/x1/y1 keys so the
# later steps can treat them uniformly.
for word in words:
    word["y0"] = word["top"]
    word["y1"] = word["bottom"]
for line in lines:
    line["x1"] = line["x0"] + line["width"]
    line["y0"] = line["top"]
    line["y1"] = line["bottom"]

2. 还原表格

为了将内容划分到合理的位置，需要还原出表格。

首先，把线段分类为横线和竖线，并剔除横线中最短的两根。

# Horizontal lines are the segments with a non-zero width.
hlines = [line for line in lines if line["width"] > 0]
# Sort longest first and drop the two shortest horizontal segments.
hlines = sorted(hlines, key=lambda h: h["width"], reverse=True)[:-2]
# Vertical lines are the segments with a non-zero height.
vlines = [line for line in lines if line["height"] > 0]
# Order the vertical lines by their y0 coordinate.
vlines = sorted(vlines, key=lambda v: v["y0"])

将线段展示出来如下图。

此时的线段是不闭合的，将缺少的线段补齐得到表格如下。

# Locate the corners of the outer border.
hx0 = hlines[0]["x0"]   # left edge (taken from the longest horizontal line)
hx1 = hlines[0]["x1"]   # right edge
vy0 = vlines[0]["y0"]   # top edge (vlines are sorted by y0)
vy1 = vlines[-1]["y1"]  # bottom edge

# Synthesize the four border segments that page.lines does not contain.
thline = {"x0": hx0, "y0": vy0, "x1": hx1, "y1": vy0}  # top horizontal line
bhline = {"x0": hx0, "y0": vy1, "x1": hx1, "y1": vy1}  # bottom horizontal line
lvline = {"x0": hx0, "y0": vy0, "x1": hx0, "y1": vy1}  # left vertical line
rvline = {"x0": hx1, "y0": vy0, "x1": hx1, "y1": vy1}  # right vertical line

hlines.insert(0, thline)
hlines.append(bhline)
vlines.insert(0, lvline)
vlines.append(rvline)

接下来，查找所有线段的交点：

# Find every intersection between a vertical and a horizontal line.
points = []
delta = 1  # tolerance so that lines which almost touch still count as crossing
for vline in vlines:
    vx0 = vline["x0"]
    vy0 = vline["y0"]
    vy1 = vline["y1"]
    for hline in hlines:
        hx0 = hline["x0"]
        hy0 = hline["y0"]
        hx1 = hline["x1"]
        # The vertical line's x must lie within the horizontal span and the
        # horizontal line's y within the vertical span.
        if (hx0 - delta) <= vx0 <= (hx1 + delta) and (vy0 - delta) <= hy0 <= (vy1 + delta):
            points.append((int(vx0), int(hy0)))
print('所有交点：', points)
print('交点总计：', len(points))

最后，根据交点构建矩形块：

# Build a 0/1 matrix indexed by the distinct y (rows) and x (columns)
# coordinates; a 1 marks a coordinate pair where two lines intersect.
X = sorted(set([int(p[0]) for p in points]))
Y = sorted(set([int(p[1]) for p in points]))
df = pd.DataFrame(index=Y, columns=X)
for p in points:
    x, y = int(p[0]), int(p[1])
    df.loc[y, x] = 1
df = df.fillna(0)

# Search for rectangles between each pair of adjacent rows.
rects = []
COLS = len(df.columns) - 1
ROWS = len(df.index) - 1
for row in range(ROWS):
    for col in range(COLS):
        p0 = df.iat[row, col]  # candidate top-left corner
        cnt = col + 1
        # Walk right until a column is found whose top and bottom corners both exist.
        while cnt <= COLS:
            p1 = df.iat[row, cnt]
            p2 = df.iat[row + 1, col]
            p3 = df.iat[row + 1, cnt]
            if p0 and p1 and p2 and p3:
                rects.append((
                    (df.columns[col], df.index[row]),
                    (df.columns[cnt], df.index[row]),
                    (df.columns[col], df.index[row + 1]),
                    (df.columns[cnt], df.index[row + 1]),
                ))
                break
            else:
                cnt += 1
print(len(rects))
for r in rects:
    print(r)

3. 将单词放入矩形框

首先，在表格中查看一下单词的位置

接下来，将内容放入到矩形框中

def inRect(point, rect):
    """Return True when *point* (x, y) lies inside *rect*, edges included.

    *rect* is a 4-tuple of corner points ordered
    (top-left, top-right, bottom-left, bottom-right).
    """
    px, py = point
    p1, p2, p3, p4 = rect
    return p1[0] <= px <= p2[0] and p1[1] <= py <= p3[1]


# Put every word into a group keyed by the rectangle that contains it ("IN"),
# or by its y coordinate when it lies outside the table ("OUT").
groups = {}
delta = 2
for word in words:
    # A word is located by its left edge and vertical midpoint.
    p = (int(word["x0"]), int((word["y0"] + word["y1"]) / 2))
    flag = False
    for r in rects:
        if inRect(p, r):
            flag = True
            groups[("IN", r[0][1], r)] = groups.get(("IN", r[0][1], r), []) + [word]
            break
    if not flag:
        # Words outside the table are merged with an existing "OUT" group
        # whose y is within delta-1 of this word's y.
        y_range = [p[1] + x for x in range(delta)] + [p[1] - x for x in range(delta)]
        out_ys = [k[1] for k in list(groups.keys()) if k[0] == "OUT"]
        flag = False
        for y in set(y_range):
            if y in out_ys:
                v = out_ys[out_ys.index(y)]
                groups[("OUT", v)].append(word)
                flag = True
                break
        if not flag:
            groups[("OUT", p[1])] = [word]

# Print the groups ordered by their y coordinate.
keys = sorted(groups.keys(), key=lambda k: k[1])
for k in keys:
    g = groups[k]
    print(k, [w["text"] for w in g])
    print("*-*-" * 20)

4. 结果及代码

最后，提取得到结果：

上图原样本示例：

最后，将代码封装整理为类：


class Extractor(object):
    """Extract the fields of a PDF e-invoice into a one-row pandas DataFrame.

    The pipeline is: read words and line segments from the first page, close
    the table border, intersect the lines, build the table cells from the
    intersections, bucket the words into the cells, and finally interpret
    each bucket by the Chinese labels it contains.

    NOTE(review): `pb` is presumably pdfplumber imported under that alias and
    `pd` is pandas; the imports live outside this block.
    """

    def __init__(self, path):
        """Remember *path* if it names an existing file, otherwise store None."""
        # os.path.isfile must be *called*; the bare function object is always
        # truthy, which made the check a no-op.
        self.file = path if path and os.path.isfile(path) else None

    def _load_data(self):
        """Read words and line segments from the first page of the PDF.

        Returns:
            ``{'words': [...], 'lines': [...]}`` with x0/y0/x1/y1 keys added,
            or ``None`` (after printing a message) when no usable ``.pdf``
            file is configured.
        """
        if not (self.file and os.path.splitext(self.file)[1] == '.pdf'):
            print("file %s cann't be opened." % self.file)
            return None
        # The context manager closes the PDF handle once the data is read.
        with pb.open(self.file) as pdf:
            page = pdf.pages[0]
            words = page.extract_words(x_tolerance=5)
            lines = page.lines
        # Convert coordinates so words and lines share the same key names.
        for word in words:
            word['y0'] = word['top']
            word['y1'] = word['bottom']
        for line in lines:
            line['x1'] = line['x0'] + line['width']
            line['y0'] = line['top']
            line['y1'] = line['bottom']
        return {'words': words, 'lines': lines}

    def _fill_line(self, lines):
        """Split *lines* into horizontal/vertical sets and add the outer border.

        Returns:
            ``{'hlines': [...], 'vlines': [...]}`` including four synthesized
            border segments.
        """
        hlines = [line for line in lines if line['width'] > 0]  # horizontal lines
        # Longest first; the two shortest horizontal segments are discarded.
        hlines = sorted(hlines, key=lambda h: h['width'], reverse=True)[:-2]
        vlines = [line for line in lines if line['height'] > 0]  # vertical lines
        vlines = sorted(vlines, key=lambda v: v['y0'])  # ordered by y0
        # Corners of the outer border.
        hx0 = hlines[0]['x0']   # left
        hx1 = hlines[0]['x1']   # right
        vy0 = vlines[0]['y0']   # top
        vy1 = vlines[-1]['y1']  # bottom
        thline = {'x0': hx0, 'y0': vy0, 'x1': hx1, 'y1': vy0}  # top horizontal line
        bhline = {'x0': hx0, 'y0': vy1, 'x1': hx1, 'y1': vy1}  # bottom horizontal line
        lvline = {'x0': hx0, 'y0': vy0, 'x1': hx0, 'y1': vy1}  # left vertical line
        rvline = {'x0': hx1, 'y0': vy0, 'x1': hx1, 'y1': vy1}  # right vertical line
        hlines.insert(0, thline)
        hlines.append(bhline)
        vlines.insert(0, lvline)
        vlines.append(rvline)
        return {'hlines': hlines, 'vlines': vlines}

    def _is_point_in_rect(self, point, rect):
        """Return True when *point* (x, y) lies inside *rect*, edges included.

        *rect* is (top-left, top-right, bottom-left, bottom-right).
        """
        px, py = point
        p1, p2, p3, p4 = rect
        return p1[0] <= px <= p2[0] and p1[1] <= py <= p3[1]

    def _find_cross_points(self, hlines, vlines):
        """Return the integer (x, y) intersections of *vlines* with *hlines*."""
        points = []
        delta = 1  # tolerance for lines that almost touch
        for vline in vlines:
            vx0 = vline['x0']
            vy0 = vline['y0']
            vy1 = vline['y1']
            for hline in hlines:
                hx0 = hline['x0']
                hy0 = hline['y0']
                hx1 = hline['x1']
                if (hx0 - delta) <= vx0 <= (hx1 + delta) and (vy0 - delta) <= hy0 <= (vy1 + delta):
                    points.append((int(vx0), int(hy0)))
        return points

    def _find_rects(self, cross_points):
        """Build table cells from intersection points.

        For every pair of adjacent y levels and every x whose top-left and
        bottom-left corners exist, the cell extends right to the first x
        whose top and bottom corners both exist.

        Returns:
            A list of rectangles, each
            ``((x0, y0), (x1, y0), (x0, y1), (x1, y1))`` with plain ints,
            in row-major order.
        """
        # A set of points gives the same membership test as the 0/1 DataFrame
        # grid without relying on fillna over object columns.
        grid = {(int(p[0]), int(p[1])) for p in cross_points}
        xs = sorted({x for x, _ in grid})
        ys = sorted({y for _, y in grid})
        rects = []
        for top, bottom in zip(ys, ys[1:]):
            for i, left in enumerate(xs[:-1]):
                if (left, top) not in grid or (left, bottom) not in grid:
                    continue
                for right in xs[i + 1:]:
                    if (right, top) in grid and (right, bottom) in grid:
                        rects.append(((left, top), (right, top), (left, bottom), (right, bottom)))
                        break
        return rects

    def _put_words_into_rect(self, words, rects):
        """Group *words* by containing cell, or by y when outside the table.

        Returns:
            A dict keyed by ``('IN', rect_top_y, rect)`` or ``('OUT', y)``
            whose values are lists of word dicts.
        """
        groups = {}
        delta = 2
        for word in words:
            # A word is located by its left edge and vertical midpoint.
            p = (int(word['x0']), int((word['y0'] + word['y1']) / 2))
            rect = next((r for r in rects if self._is_point_in_rect(p, r)), None)
            if rect is not None:
                groups.setdefault(('IN', rect[0][1], rect), []).append(word)
                continue
            # Outside the table: join an existing OUT group within delta-1 of
            # this y. Candidates are tried in ascending order so the choice is
            # deterministic.
            candidates = sorted({p[1] + d for d in range(delta)} | {p[1] - d for d in range(delta)})
            match = next((y for y in candidates if ('OUT', y) in groups), None)
            if match is None:
                groups[('OUT', p[1])] = [word]
            else:
                groups[('OUT', match)].append(word)
        return groups

    def _find_text_by_same_line(self, group, delta=1):
        """Concatenate the words of *group* that sit on the same text line.

        Words are taken left to right; a word joins an existing line whose
        integer bottom is within ``delta - 1`` of its own.

        Returns:
            A dict mapping line bottom (int) to the concatenated text.
        """
        merged = {}
        for w in sorted(group, key=lambda item: item['x0']):
            bottom = int(w['bottom'])
            nearby = sorted({bottom + i for i in range(delta)} | {bottom - i for i in range(delta)})
            key = next((y for y in nearby if y in merged), bottom)
            merged[key] = merged.get(key, '') + w['text']
        return merged

    def _split_words_into_diff_line(self, groups):
        """Map every group key to its ``{line_bottom: text}`` dict."""
        return {k: self._find_text_by_same_line(g, 3) for k, g in groups.items()}

    def _index_of_y(self, x, rects):
        """Return the index following the key whose rectangle starts at *x*.

        *rects* is a list of ``('IN', y, rect)`` keys. Returns ``None`` when
        no key starts at *x* or when it is the last one.
        """
        for index, r in enumerate(rects):
            if x == r[2][0][0]:
                return index + 1 if index + 1 < len(rects) else None
        return None

    def _find_outer(self, k, words):
        """Interpret the text lines of a group located outside the table.

        Returns:
            A DataFrame with row 0 holding whichever header/footer fields
            were recognised; empty when nothing matched.
        """
        df = pd.DataFrame()
        for text in words.values():
            if re.search(r'发票$', text):  # invoice title
                df.loc[0, '发票名称'] = text
            elif re.search(r'发票代码', text):  # invoice code
                df.loc[0, '发票代码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'发票号码', text):  # invoice number
                df.loc[0, '发票号码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'开票日期', text):  # issue date
                date = ''.join(re.findall(r'[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日', text))
                df.loc[0, '开票日期'] = date
            elif '机器编号' in text and '校验码' in text:
                # Both labels on one line: pick the digits after each label.
                # A label without "label:digits" is skipped instead of
                # crashing on a failed match.
                for label in ('校验码', '机器编号'):
                    found = re.search(label + r':\d+', text)
                    if found:
                        df.loc[0, label] = ''.join(re.findall(r'[0-9]+', found[0]))
            elif '机器编号' in text:  # machine number
                df.loc[0, '机器编号'] = ''.join(re.findall(r'[0-9]+', text))
            elif '校验码' in text:  # check code
                df.loc[0, '校验码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'收款人', text):
                # NOTE(review): blank fields are dropped before positional
                # assignment, so an empty field shifts the ones after it.
                items = re.split(r'收款人:|复核:|开票人:|销售方:', text)
                items = [item for item in items if re.sub(r'\s+', '', item) != '']
                df.loc[0, '收款人'] = items[0] if len(items) > 0 else ''
                df.loc[0, '复核'] = items[1] if len(items) > 1 else ''
                df.loc[0, '开票人'] = items[2] if len(items) > 2 else ''
                df.loc[0, '销售方'] = items[3] if len(items) > 3 else ''
        return df

    def _find_and_sort_rect_in_same_line(self, y, groups):
        """Return the 'IN' keys of *groups* whose cell top is *y*, left to right."""
        # 'OUT' keys are 2-tuples without a rectangle; an OUT group that shares
        # this y would make the sort key raise IndexError, so filter them out.
        same_rects_k = [k for k in groups if k[0] == 'IN' and k[1] == y]
        return sorted(same_rects_k, key=lambda item: item[2][0][0])

    def _neighbor_words(self, k, groups, groups2):
        """Return the line dict of the next populated cell to the right of *k*.

        Returns ``None`` when *k* is the last populated cell in its row.
        """
        row_keys = self._find_and_sort_rect_in_same_line(k[1], groups)
        target_index = self._index_of_y(k[2][0][0], row_keys)
        if target_index is None:
            return None
        return groups2[row_keys[target_index]]

    def _find_inner(self, k, words, groups, groups2, free_zone_flag=False):
        """Interpret the text lines of a group located inside a table cell.

        Label cells (buyer/seller, cipher zone, total, remark) take their
        value from the next populated cell in the same row. The first row
        with exactly eight populated cells is read as the item table, where
        each cell's first line is the column name and the rest its values.

        Returns:
            ``(df, free_zone_flag)``; the flag becomes True once the item
            table has been consumed so that it is only read once.
        """
        df = pd.DataFrame()
        ordered = sorted(words.items(), key=lambda item: item[0])
        context = ''.join(text for _, text in ordered)

        if '购买方' in context or '销售方' in context:
            prefix = '购买方' if '购买方' in context else '销售方'
            neighbor = self._neighbor_words(k, groups, groups2) or {}
            for text in neighbor.values():
                if '名称' in text:
                    df.loc[0, prefix + '名称'] = re.sub(r'名称:', '', text)
                elif '纳税人识别号' in text:
                    df.loc[0, prefix + '纳税人识别号'] = re.sub(r'纳税人识别号:', '', text)
                elif '地址、电话' in text:
                    df.loc[0, prefix + '地址电话'] = re.sub(r'地址、电话:', '', text)
                elif '开户行及账号' in text:
                    df.loc[0, prefix + '开户行及账号'] = re.sub(r'开户行及账号:', '', text)
        elif '密码区' in context:
            neighbor = self._neighbor_words(k, groups, groups2) or {}
            df.loc[0, '密码区'] = ''.join(neighbor.values())
        elif '价税合计' in context:
            neighbor = self._neighbor_words(k, groups, groups2) or {}
            group_context = ''.join(neighbor.values())
            # Text before the "(小写)" marker is the amount in words, text
            # after it the amount in figures.
            items = re.split(r'[(（]小写[)）]', group_context)
            df.loc[0, '价税合计(大写)'] = items[0] if len(items) > 0 else ''
            df.loc[0, '价税合计(小写)'] = items[1] if len(items) > 1 else ''
        elif '备注' in context:
            neighbor = self._neighbor_words(k, groups, groups2) or {}
            df.loc[0, '备注'] = ''.join(neighbor.values())
        else:
            if free_zone_flag:
                return df, free_zone_flag
            row_keys = self._find_and_sort_rect_in_same_line(k[1], groups)
            if len(row_keys) == 8:
                free_zone_flag = True
                for kk in row_keys:
                    cell = sorted(groups2[kk].items(), key=lambda item: item[0])
                    if not cell:
                        continue
                    key = cell[0][1]
                    if not key:
                        continue
                    df.loc[0, key] = '\n'.join(text for _, text in cell[1:])
        return df, free_zone_flag

    def extract(self):
        """Run the whole pipeline and return the extracted fields.

        Returns:
            A DataFrame whose row 0 holds the recognised fields; empty when
            the file could not be loaded or nothing was recognised.
        """
        data = self._load_data()
        if data is None:
            # _load_data has already reported the problem.
            return pd.DataFrame()
        filled = self._fill_line(data['lines'])
        cross_points = self._find_cross_points(filled['hlines'], filled['vlines'])
        rects = self._find_rects(cross_points)
        word_groups = self._put_words_into_rect(data['words'], rects)
        word_groups2 = self._split_words_into_diff_line(word_groups)

        frames = []
        free_zone_flag = False
        for k, words in word_groups2.items():
            if k[0] == 'OUT':
                df_item = self._find_outer(k, words)
            else:
                df_item, free_zone_flag = self._find_inner(
                    k, words, word_groups, word_groups2, free_zone_flag)
            if not df_item.empty:
                frames.append(df_item)
        # Concatenate once, column-wise, instead of growing a frame per group.
        return pd.concat(frames, axis=1) if frames else pd.DataFrame()


if __name__ == "__main__":
    path = r'data.pdf'
    data = Extractor(path).extract()
    print(data)

PDF电子发票内容提取相关推荐

python提取pdf发票信息_PDF电子发票内容提取
网页版程序使用地址:[在线使用](https://www.yooongchun.com/apps) 摘要本文介绍如何提取PDF版电子发票的内容. 1. 加载内容首先使用Python的pdfplum ...
如何把pdf电子发票转为excel
如何把pdf电子发票转为excel 越来越多的发票是电子发票版,发票的收集整理就是问题.如何能转为excel? 经过一番搜索找到一个靠谱工具.不但能把发票转为excel,还能把商品明细都提取出来. 下 ...
OFD、PDF电子发票同时打开预览
跟着国家的脚本,OFD版式的电子发票已经是大势所趋,OFD电子发票带来了很多便利,例如安全.便捷等.但是需要额外安装阅读工具才可以打开它预览核对,同时市面上也还有PDF的电子发票,那每一种发票都需要配 ...
C#pdf电子发票转图片
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档文章目录前言一.所需要的包? 二.使用步骤总结前言提示:最近对接了百旺的电子发票,有个需求是要在移动端上展示电子发票图片,结 ...
打印pdf文件 vfp_新技能，如何将多份pdf电子发票文件合成一份文档打印
作为一名行政人员,每个月都会有大量的发票需要处理.特别是到了年底,更是会有大量的电子发票需要存档.打印.报销等. 如果将大量的电子发票分别打印到A4纸上,会造成资源大量的浪费,不利于行政办公室节约的主 ...
pdf电子发票打印方法
1.如下图所示,pdf版电子发票,使用极速pdf阅读器打印方式: 2.如下图所示,pdf版电子发票,使用浏览器打印方式:
多张图片或PDF电子发票怎么用PYTHON打印在一张A4纸上
from os import listdir from os.path import join import fitz,os from PIL import Image import shutil,t ...
python 提取pdf格式电子发票并改名
本人小公司企业主一枚,经常接受电子发票,然后有空发给会计,默认的发票基本都是发票号,看不出是哪个公司开的,哪个公司收的. 经过多次研究使用pymupdf读取pdf格式,但是不同省份和城市开出的电子发票 ...
python 发票信息提取_Python提取发票内容保存到Excel.md
Python提取PDF发票内容保存到Excel --- 摘要:这篇文章介绍如何把发票内容提取出来保存到Excel中.文章分为两个部分,第一部分程序用法,第二部分介绍代码. --- 作者:yooongc ...