java+OpenCV3 +百度OCR(或tesseract) 识别表格数据

原理：先用 OpenCV 识别出表格的横线、竖线及其交点，再按交点把表格拆分成一张张单元格图片，最后把每张单元格图片交给百度 OCR 或 tesseract 识别。

当然，预算充足的话也可以直接购买百度的 OCR 表格识别服务。

package com.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.json.JSONArray;
import org.json.JSONObject;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfPoint;
import org.opencv.core.MatOfPoint2f;
import org.opencv.core.Point;
import org.opencv.core.Rect;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;

import com.baidu.aip.ocr.AipOcr;

/**
* Servlet implementation class TutableRead
*/
@WebServlet("/TutableReadBaidu")
public class TutableReadBaidu extends HttpServlet {
private static final long serialVersionUID = 1L;

protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
doPost(request, response);
}
static{
System.load("F:/opencv/build/java/x86/opencv_java310.dll");
}

//注册百度有设置APPID/AK/SK
public static final String APP_ID = "";
public static final String API_KEY = "";
public static final String SECRET_KEY = "";
/**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
long startTime=System.currentTimeMillis();
String basePath = request.getSession().getServletContext().getRealPath("/images/");
File dir=new File(basePath);
if(dir.isDirectory()){
for (File f : dir.listFiles()) {f.delete(); }
}
Mat src = Imgcodecs.imread( "C:/Users/lilin/Desktop/"+request.getParameter("name")+".png");
if(src.empty()){ System.out.println( "not found file" ); return; }
Mat gray = new Mat();
Mat erod = new Mat();
Mat blur = new Mat();
int src_height=src.cols(), src_width=src.rows();
//先转为灰度 cvtColor(src, gray, COLOR_BGR2GRAY);
Imgproc.cvtColor(src, gray, Imgproc.COLOR_BGR2GRAY);

/**
* 腐蚀（黑色区域变大）
Mat element = getStructuringElement(MORPH_RECT, Size(erodeSize, erodeSize));
erode(gray, erod, element);
*/
int erodeSize = src_height / 200;
if (erodeSize % 2 == 0){ erodeSize++; }
Mat element = Imgproc.getStructuringElement(Imgproc.MORPH_RECT, new Size(erodeSize, erodeSize));
Imgproc.erode(gray, erod, element);

//高斯模糊化
int blurSize = src_height / 200;
if (blurSize % 2 == 0) {blurSize++; }
Imgproc.GaussianBlur(erod, blur, new Size(blurSize, blurSize), 0, 0);

//封装的二值化 adaptiveThreshold(~gray, thresh, 255, CV_ADAPTIVE_THRESH_MEAN_C, CV_THRESH_BINARY, 15, -2);
Mat thresh = gray.clone();
Mat xx = new Mat();
Core.bitwise_not(gray,xx);//反色
Imgproc.adaptiveThreshold(xx, thresh, 255, Imgproc.ADAPTIVE_THRESH_MEAN_C, Imgproc.THRESH_BINARY, 15, -2);
/*
这部分的思想是将线条从横纵的方向处理后抽取出来，再进行交叉，矩形的点，进而找到矩形区域的过程

*/
// Create the images that will use to extract the horizonta and vertical lines
//使用二值化后的图像来获取表格横纵的线
Mat horizontal = thresh.clone();
Mat vertical = thresh.clone();
//这个值越大，检测到的直线越多
String parameter = request.getParameter("xian"); if(parameter==null||parameter.equals("") ){ parameter="20"; }
int scale = Integer.parseInt(parameter); // play with this variable in order to increase/decrease the amount of lines to be detected 使用这个变量来增加/减少待检测的行数

// Specify size on horizontal axis 指定水平轴上的大小
int horizontalsize = horizontal.cols() / scale;
// Create structure element for extracting horizontal lines through morphology operations 创建通过形态学运算提取水平线的结构元素
// 为了获取横向的表格线，设置腐蚀和膨胀的操作区域为一个比较大的横向直条
Mat horizontalStructure = Imgproc.getStructuringElement(Imgproc.MORPH_RECT,new Size(horizontalsize, 1));
// Apply morphology operations
// 先腐蚀再膨胀
// iterations 最后一个参数，迭代次数，越多，线越多。在页面清晰的情况下1次即可。
Imgproc.erode(horizontal, horizontal, horizontalStructure,new Point(-1, -1),1 );
Imgproc.dilate(horizontal, horizontal, horizontalStructure,new Point(-1, -1),1);
// dilate(horizontal, horizontal, horizontalStructure, Point(-1, -1)); // expand horizontal lines

// Specify size on vertical axis 同上
int verticalsize = vertical.rows() / scale;
// Create structure element for extracting vertical lines through morphology operations
Mat verticalStructure = Imgproc.getStructuringElement(Imgproc.MORPH_RECT,new Size(1, verticalsize));
Imgproc.erode(vertical, vertical, verticalStructure,new Point(-1, -1),1);
Imgproc.dilate(vertical, vertical, verticalStructure, new Point(-1, -1),1);
/*
* 合并线条
* 将垂直线，水平线合并为一张图
*/
Mat mask = new Mat();
Core.add(horizontal,vertical,mask);
/*
* 通过 bitwise_and 定位横线、垂直线交汇的点
*/
Mat joints=new Mat();
Core.bitwise_and(horizontal, vertical, joints);
/*
* 通过 findContours 找轮廓
*
* 第一个参数，是输入图像，图像的格式是8位单通道的图像，并且被解析为二值图像（即图中的所有非零像素之间都是相等的）。
* 第二个参数，是一个 MatOfPoint 数组，在多数实际的操作中即是STL vectors的STL vector，这里将使用找到的轮廓的列表进行填充（即，这将是一个contours的vector,其中contours[i]表示一个特定的轮廓，这样，contours[i][j]将表示contour[i]的一个特定的端点）。
* 第三个参数，hierarchy，这个参数可以指定，也可以不指定。如果指定的话，输出hierarchy，将会描述输出轮廓树的结构信息。0号元素表示下一个轮廓（同一层级）；1号元素表示前一个轮廓（同一层级）；2号元素表示第一个子轮廓（下一层级）；3号元素表示父轮廓（上一层级）
* 第四个参数，轮廓的模式，将会告诉OpenCV你想用何种方式来对轮廓进行提取，有四个可选的值：
* CV_RETR_EXTERNAL （0）：表示只提取最外面的轮廓；
* CV_RETR_LIST （1）：表示提取所有轮廓并将其放入列表；
* CV_RETR_CCOMP （2）:表示提取所有轮廓并将组织成一个两层结构，其中顶层轮廓是外部轮廓，第二层轮廓是“洞”的轮廓；
* CV_RETR_TREE （3）：表示提取所有轮廓并组织成轮廓嵌套的完整层级结构。
* 第五个参数，见识方法，即轮廓如何呈现的方法，有三种可选的方法：
* CV_CHAIN_APPROX_NONE （1）：将轮廓中的所有点的编码转换成点；
* CV_CHAIN_APPROX_SIMPLE （2）：压缩水平、垂直和对角直线段，仅保留它们的端点；
* CV_CHAIN_APPROX_TC89_L1 （3）or CV_CHAIN_APPROX_TC89_KCOS（4）：应用Teh-Chin链近似算法中的一种风格
* 第六个参数，偏移，可选，如果是定，那么返回的轮廓中的所有点均作指定量的偏移
*/
List<MatOfPoint> contours = new ArrayList<MatOfPoint>();
Mat hierarchy = new Mat();
Imgproc.findContours(mask,contours,hierarchy, Imgproc.RETR_EXTERNAL, Imgproc.CHAIN_APPROX_SIMPLE,new Point(0,0));

List<MatOfPoint> contours_poly = contours;
Rect[] boundRect = new Rect[contours.size()];
List<Mat> tables = new ArrayList<Mat>();
//my
List<Rect> haveReacts = new ArrayList();
Map<String, Map<String, Map<String, Double>>> mappoint=new HashMap<String, Map<String, Map<String, Double>>>();
//循环所有找到的轮廓-点
for(int i=0 ; i< contours.size(); i++){
//每个表的点
MatOfPoint point = contours.get(i);
MatOfPoint contours_poly_point = contours_poly.get(i);
/*
* 获取区域的面积
* 第一个参数，InputArray contour：输入的点，一般是图像的轮廓点
* 第二个参数，bool oriented = false:表示某一个方向上轮廓的的面积值，顺时针或者逆时针，一般选择默认false
*/
double area = Imgproc.contourArea(contours.get(i));
//如果小于某个值就忽略，代表是杂线不是表格
if(area < 100){ continue; }
/*
* approxPolyDP 函数用来逼近区域成为一个形状，true值表示产生的区域为闭合区域。比如一个带点幅度的曲线，变成折线
*
* MatOfPoint2f curve：像素点的数组数据。
* MatOfPoint2f approxCurve：输出像素点转换后数组数据。
* double epsilon：判断点到相对应的line segment 的距离的阈值。（距离大于此阈值则舍弃，小于此阈值则保留，epsilon越小，折线的形状越“接近”曲线。）
* bool closed：曲线是否闭合的标志位。
*/
Imgproc.approxPolyDP(new MatOfPoint2f(point.toArray()),new MatOfPoint2f(contours_poly_point.toArray()),3,true);
//为将这片区域转化为矩形，此矩形包含输入的形状
boundRect[i] = Imgproc.boundingRect(contours_poly.get(i));
// 找到交汇处的的表区域对象
Mat table_image = joints.submat(boundRect[i]);

List<MatOfPoint> table_contours = new ArrayList<MatOfPoint>();
Mat joint_mat = new Mat();
Imgproc.findContours(table_image, table_contours,joint_mat, Imgproc.RETR_CCOMP, Imgproc.CHAIN_APPROX_SIMPLE);
//从表格的特性看，如果这片区域的点数小于4，那就代表没有一个完整的表格，忽略掉
if (table_contours.size() < 4){ continue; }

//表格里面的每个点
Map<String, Double> x_zhis=new HashMap<String, Double>();
Map<String, Double> y_zhis=new HashMap<String, Double>();
for (MatOfPoint matOfPoint : table_contours) {
Point[] array = matOfPoint.toArray();
for (Point point2 : array) { x_zhis.put("x"+point2.x, point2.x); y_zhis.put("y"+point2.y, point2.y); }
}
//System.out.println( boundRect[i].x+"|"+boundRect[i].y+"|"+boundRect[i].width+"|"+boundRect[i].height+"|"+table_contours.size()+">>>>>>>>>>>>>>>>>>>");
//my add
haveReacts.add( boundRect[i]);
Map<String, Map<String, Double>> x =new HashMap<String, Map<String,Double>>(); x.put("x", x_zhis);x.put("y", y_zhis);
mappoint.put("key"+(haveReacts.size()-1),x );

//保存图片
tables.add(src.submat(boundRect[i]).clone());
//将矩形画在原图上
Imgproc.rectangle(src, boundRect[i].tl(), boundRect[i].br(), new Scalar(255, 0, 255), 1, 8, 0);

}

//页面数据
Map<String,String> jspdata=new HashMap<String, String>();

for(int i=0; i< tables.size(); i++ ){ Mat table = tables.get(i); Rect rect = haveReacts.get(i);
int width = rect.width,height=rect.height;
Map<String, Map<String, Double>> mapdata = mappoint.get("key"+i);
int[] x_z = maptoint(mapdata.get("x"));
int[] y_z = maptoint(mapdata.get("y"));

//纵切
String px_biao = request.getParameter("x_biao"); if(px_biao==null||px_biao.equals("") ){ px_biao="5"; }
int x_len=0,x_biao=Integer.parseInt(px_biao);
List<Mat> mats=new ArrayList<Mat>();
for (int j = 0; j < x_z.length; j++) {
if(j==0){
Mat img=new Mat(table,new Rect(0,0,x_z[j],height ));if(img.cols()>x_biao ){ mats.add(img); x_len++;}
}else{
Mat img=new Mat(table,new Rect(x_z[j-1],0,x_z[j]-x_z[j-1],height )); if(img.cols()>x_biao ){mats.add(img);x_len++;}
if(j==x_z.length-1){//最后一个处理
Mat img1=new Mat(table,new Rect(x_z[x_z.length-1],0,width-x_z[x_z.length-1],height )); if(img.cols()>x_biao ){mats.add(img1); }
}
}
}
imshow(basePath,table,"table_"+i+".png");//当前table图
//横切保存
String py_biao = request.getParameter("y_biao"); if(py_biao==null||py_biao.equals("") ){ py_biao="5"; }
int y_len=0,y_biao=Integer.parseInt(py_biao );
for (int j = 0; j <mats.size() ; j++) { Mat mat = mats.get(j);
int tuwidth = mat.cols(),tugao=mat.rows();
int cy_len=0;
for (int k = 0; k < y_z.length; k++) {
if(k==0){
Mat img=new Mat(mat,new Rect(0,0,tuwidth , y_z[k] ));if(img.rows()>y_biao ){ imshow(basePath, img,"table_"+i+"_"+j+"_"+cy_len+".png"); cy_len++; }
}else{
Mat img=new Mat(mat,new Rect(0,y_z[k-1],tuwidth,y_z[k]-y_z[k-1]));if(img.rows()>y_biao ){ imshow(basePath, img,"table_"+i+"_"+j+"_"+cy_len+".png"); cy_len++;}
if(k==y_z.length-1){//最后一个处理
Mat img1=new Mat(mat,new Rect(0,y_z[k],tuwidth,tugao-y_z[k] ));if(img.rows()>y_biao ){ imshow(basePath, img1,"table_"+i+"_"+j+"_"+(cy_len)+".png"); }
}
}
}
y_len=cy_len;
}
//保存数据信息
jspdata.put("table_"+i, x_len+"_"+y_len);
}
request.setAttribute("data", jspdata);

//百度识别处理
AipOcr client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
// 可选：设置网络连接参数
client.setConnectionTimeoutInMillis(2000);
client.setSocketTimeoutInMillis(60000);

Map<String,String> jspdata1=new HashMap<String, String>();
int num=0;
for (Map.Entry<String, String> d : jspdata.entrySet()) {
String value= d.getValue();
if(value.indexOf("_")!=-1){
//
String x="";
String len[]=value.split("_");
int xlen=Integer.parseInt(len[0]);int ylen=Integer.parseInt(len[1]);
for(int i=0;i<ylen;i++){
//行
for(int j=0;j<xlen;j++){
String name="table_"+num+"_"+j+"_"+i+".png";
JSONObject res = client.basicGeneral(basePath+"/"+name, new HashMap<String, String>());
String text="";
try {
Object words_result = res.get("words_result");
JSONArray array=(JSONArray) words_result;
text=getjsontext(array);
} catch (Exception e) { System.out.println("cuowu"); }
try {
Thread.sleep(400);//百度qps限制
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
x+=(text.equalsIgnoreCase("")?" ":text)+"&_&";
}
x=x.substring(0, x.lastIndexOf("&_&"));
x+="#_#";
}
//
jspdata1.put("shibie"+num, x);
}
num++;
}
long endTime=System.currentTimeMillis();
request.setAttribute("time", (float)(endTime-startTime)/1000);
request.setAttribute("shibiedata", jspdata1);
request.getRequestDispatcher("tutableread.jsp").forward(request,response);
}
public void imshow(String basePath,Mat dst,String name) {
Imgcodecs.imwrite(basePath+"/"+name, dst);
}
public String getjsontext(JSONArray array){
String text="";
for (int i = 0; i < array.length(); i++) { JSONObject textx = (JSONObject)array.get(i); text+=textx.get("words"); }
return text;
}
public int[] maptoint(Map<String, Double> x) {
int[] zhi=new int[x.size()];int num=0;
for (Map.Entry<String, Double> m :x.entrySet()) {
zhi[num]=m.getValue().intValue(); num++;
}
Arrays.sort(zhi);
return zhi;
}

}

效果图（我主要是为了获取数据单元格，所以拆分得比较粗略，没有对合并的单元格做处理）

参考文章

https://my.oschina.net/u/3767256/blog/1615720

https://blog.csdn.net/yomo127/article/details/52045146

如果你对此感兴趣的可以加群261074724讨论

java+OpenCV3 +百度OCR(或tesseract) 识别表格数据相关推荐

保姆级——Java调用百度OCR实现身份证识别
具体实现功能和参数,可以看百度的API开发文档:https://ai.baidu.com/ai-doc/OCR/rk3h7xzck 其实我是看不懂API文档的文章中的请求参数调用百度的OCR需要做 ...
java调用ocr识别api_Java文字识别软件-调用百度ocr实现文字识别
java_baidu_ocr Java调用百度OCR文字识别API实现图片文字识别软件项目源代码在文末,放到了GitHub上 - https://github.com/Ymy214/java_bai ...
java ocr文字识别软件_Java文字识别软件-调用百度ocr实现文字识别
java_baidu_ocr Java调用百度OCR文字识别API实现图片文字识别软件这是一款小巧方便,强大的文字识别软件,由Java编写,配上了窗口界面调用了百度ocr文字识别API 识别精度高 ...
Java调用百度OCR文字识别API实现图片文字识别软件
java_baidu_ocr Java调用百度OCR文字识别API实现图片文字识别软件这是一款小巧方便,强大的文字识别软件,由Java编写,配上了窗口界面调用了百度ocr文字识别API 识别精度高 ...
一篇文章搞定百度OCR图片文字识别API
一篇文章搞定百度OCR图片文字识别API https://www.jianshu.com/p/7905d3b12104 转载于:https://www.cnblogs.com/chongdongxia ...
Java后端 + 百度SDK实现人脸识别
Java后端 + 百度SDK实现人脸识别人工智能越来越贴近我们的生活,相信大家也经常接触到人脸识别,手机付款.app注册验证.门禁等等. 如果要用Java后台使用这些功能,那么需要怎么做呢?请看完下 ...
python 百度识图_python截图+百度ocr（图片识别）+ 百度翻译
python截图+百度ocr(图片识别)+ 百度翻译一直想用python做一个截图并自动翻译的工具,恰好最近有时间就在网上找了资料,根据资料以及自己的理解做了一个简单的截图翻译工具.整理一下并把代码 ...
java 操作 word 表格和样式,java读取word表格中的表格 java如何读取word中的excel表格数据...
Java 利用poi 可以直接读取word中的表格保持样式生1.读取word 2003及word 2007需要的jar包读取 2003 版本(.doc)的word文件相对来说比较简单,只需要 poi ...
Java调用百度OCR文字识别的接口
调用百度OCR文字识别的接口,来自于百度官网,亲测可以使用跳转链接 FileUtil的下载链接 Base64Util下载链接 HttpUtil下载链接 GsonUtils下载链接 Accurate. ...

java+OpenCV3 +百度OCR(或tesseract) 识别表格数据

java+OpenCV3 +百度OCR(或tesseract) 识别表格数据相关推荐

最新文章

热门文章