使用的POI版本为3.14

在sax模式下读取一个20万行的xlsx报错:org.apache.xmlbeans.XmlException: java.io.CharConversionException: Characters larger than 4 bytes are not supported: byte 0xb1 implies a length of more than 4 bytes

网上查到的说法是:xlsx文件中存在乱码,SAX在解析其内部的XML时遇到了长度超过4字节的字符,因此抛出此异常。

多方尝试均未能解决,最后找到了下面这份代码;把读取xlsx的代码替换成它之后问题消失,具体原因尚不清楚。

此处给出大神的原地址:https://download.csdn.net/download/zwyjg/9606945

我自己的代码对以上链接的源码做了些修改。

ExcelTool.java:

package Tools;import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;import Tools.RowDataProcesser;
import Tools.Xlsx2ListData;public class ExcelTool {/*** 读取xlsx文件,不适用于读取大文件*/public static List<List<String>> read(String filePath) throws Exception {List<List<String>> result = new ArrayList<>();Workbook wb = new XSSFWorkbook(new File(filePath));for (Sheet sheet : wb) {for (Row row : sheet) {List<String> rowData = getRowData(row);result.add(rowData);}}wb.close();return result;}/*** 读取xlsx文件,不适用于读取大文件* * @return map: key->sheet name, value->rowData*/public static Map<String, List<List<String>>> readMultiSheet(String filePath)throws Exception {Map<String, List<List<String>>> result = new LinkedHashMap<>();Workbook wb = new XSSFWorkbook(new File(filePath));for (Sheet sheet : wb) {List<List<String>> sheetResult = new ArrayList<>();for (Row row : sheet) {List<String> rowData = getRowData(row);sheetResult.add(rowData);}result.put(sheet.getSheetName(), sheetResult);}wb.close();return result;}/*** 以SAX模式读取xlsx文件,读取后将结果全部加载到内存,适用于读取较大文件* * @param filePath*            文件路径* @param minColumns*            补齐到多少列,-1表示不补齐*/public static List<List<Object>> readBigFile(String filePath, int minColumns)throws Exception {File xlsxFile = new File(filePath);OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);Xlsx2ListData xlsx2ListData = new Xlsx2ListData(p, minColumns, null);List<List<Object>> result = xlsx2ListData.process();p.close();return result;}/*** 以SAX方式读取xlsx文件,读取每一行调用RowDataProcesser,适用于读超大文件* * @param filePath*            文件路径* @param minColumns*            补齐到多少列,-1表示不补齐* @param rowDataProcesser*            处理每一行的数据*/public static void readBigFile(String filePath, int minColumns,RowDataProcesser rowDataProcesser) throws Exception {File xlsxFile = new File(filePath);OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);Xlsx2ListData xlsx2ListData = new Xlsx2ListData(p, minColumns,rowDataProcesser);xlsx2ListData.process();p.close();}private static List<String> getRowData(Row row) {List<String> rowData = new ArrayList<>();int cellNum = 
row.getLastCellNum();for (int i = 0; i < cellNum; i++) {Cell cell = row.getCell(i);if (cell == null) {rowData.add("");continue;}switch (cell.getCellType()) {case Cell.CELL_TYPE_STRING:rowData.add(cell.getRichStringCellValue().getString());break;case Cell.CELL_TYPE_NUMERIC:rowData.add(new DataFormatter().formatCellValue(cell));break;case Cell.CELL_TYPE_BOOLEAN:rowData.add(cell.getBooleanCellValue() + "");break;case Cell.CELL_TYPE_FORMULA:rowData.add(cell.getCellFormula());break;case Cell.CELL_TYPE_BLANK:case Cell.CELL_TYPE_ERROR:default:rowData.add("");break;}}return rowData;}
}

RowDataProcesser.java:

package Tools;

import java.util.List;

import model.Tweet;

/**
 * Callback that receives the contents of one spreadsheet row at a time.
 */
public interface RowDataProcesser {

    /**
     * Handles the data of a single row.
     *
     * @param rowData the values of that row
     */
    void processRowData(List<Object> rowData);
}

Xlsx2ListData.java:

package Tools;import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;import javax.xml.parsers.ParserConfigurationException;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;public class Xlsx2ListData {/*** Uses the XSSF Event SAX helpers to do most of the work of parsing the* Sheet XML, and outputs the contents as a List.*/private class SheetToList implements SheetContentsHandler {private boolean firstCellOfRow = false;private int currentRow = -1;private int currentCol = -1;private Object defaultValue = "";List<Object> currRowData = new ArrayList<>();private List<List<Object>> data = new ArrayList<>();private void outputMissingRows(int number) {for (int i = 0; i < number; i++) {List<Object> rowData = new ArrayList<>();for (int j = 0; j < minColumns; j++) {rowData.add(defaultValue);}processRowData(rowData);}}private void processRowData(List<Object> rowData) {if (rowDataProcesser != null) {rowDataProcesser.processRowData(rowData);} else {data.add(rowData);}currRowData = new ArrayList<>();}public List<List<Object>> getData() {return data;}public void startRow(int rowNum) {// If there were gaps, output the missing rowsoutputMissingRows(rowNum - currentRow - 1);// Prepare for this rowfirstCellOfRow = true;currentRow = rowNum;currentCol = -1;}public void endRow(int rowNum) {// Ensure the minimum number of columnsfor (int i = currentCol; i < minColumns - 1; i++) {currRowData.add(defaultValue);}processRowData(currRowData);}@Overridepublic void cell(String cellReference, String formattedValue,XSSFComment comment) {if (firstCellOfRow) {firstCellOfRow = false;}// gracefully handle missing CellRef here in a similar way as// XSSFCell doesif (cellReference == null) {cellReference = new CellAddress(currentRow, currentCol).formatAsString();}// Did we miss any cells?int thisCol = (new CellReference(cellReference)).getCol();int missedCols = thisCol - currentCol - 1;for (int i = 0; i < missedCols; i++) {currRowData.add(defaultValue);}currentCol = thisCol;currRowData.add(formattedValue);}public void headerFooter(String text, boolean isHeader, String tagName) {// Skip, ignore headers or footers}}// /private final OPCPackage 
xlsxPackage;/*** Number of columns to read starting with leftmost*/private final int minColumns;private final RowDataProcesser rowDataProcesser;/*** Creates a new XLSX -> List converter** @param pkg*            The XLSX package to process* @param minColumns*            The minimum number of columns to output, or -1 for no minimum* @param rowDataProcesser*            process row data*/public Xlsx2ListData(OPCPackage pkg, int minColumns,RowDataProcesser rowDataProcesser) {this.xlsxPackage = pkg;this.minColumns = minColumns;this.rowDataProcesser = rowDataProcesser;}/*** Parses and shows the content of one sheet using the specified styles and* shared-strings tables.** @param styles* @param strings* @param sheetInputStream*/public void processSheet(StylesTable styles,ReadOnlySharedStringsTable strings,SheetContentsHandler sheetHandler, InputStream sheetInputStream)throws IOException, ParserConfigurationException, SAXException {DataFormatter formatter = new DataFormatter();InputSource sheetSource = new InputSource(sheetInputStream);try {XMLReader sheetParser = SAXHelper.newXMLReader();ContentHandler handler = new XSSFSheetXMLHandler(styles, null,strings, sheetHandler, formatter, false);sheetParser.setContentHandler(handler);sheetParser.parse(sheetSource);} catch (ParserConfigurationException e) {throw new RuntimeException("SAX parser appears to be broken - "+ e.getMessage());}}/*** Initiates the processing of the XLS workbook file to List.** @throws IOException* @throws OpenXML4JException* @throws ParserConfigurationException* @throws SAXException*/public List<List<Object>> process() throws IOException, OpenXML4JException,ParserConfigurationException, SAXException {ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);StylesTable styles = xssfReader.getStylesTable();XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();List<List<Object>> result = new 
ArrayList<>();while (iter.hasNext()) {InputStream stream = iter.next();SheetToList sheetToList = new SheetToList();processSheet(styles, strings, sheetToList, stream);result.addAll(sheetToList.getData());stream.close();}return result;}}

POI在sax模式下读取xlsx报错 Characters larger than 4 bytes are not supported解决办法相关推荐

  1. aMDcpu不支持mysql_AMD黑苹果 使用numpy或pytorch或Adobe 报错Intel MKL ERROR: CPU 0 is not supported解决办法...

    日期:2020-11-04 macos版本:10.15.7 Python版本:anaconda python 3.8 CPU:AMD Ryzen5 3600 参考文档: http://bbs.pcbe ...

  2. 【记录】IDEA未正确关闭导致打开报错,进不了主界面,含解决办法

    [记录]IDEA未正确关闭导致打开报错,进不了主界面,含解决办法 错误提示 解决方案 参考 错误提示 截取了错误的主要部分 java.util.concurrent.CompletionExcepti ...

  3. pycharm报错:Error configuring SDK: Accessing invalid virtual file: 解决办法

    pycharm报错:Error configuring SDK: Accessing invalid virtual file: 解决办法: 针对linux系统, 删除home目录下pycharm配置 ...

  4. oracle operation_type,案例:Oracle报错performing DML/DDL operation over object in bin解决办法

    天萃荷净 运维DBA在巡检时发现alert日志文件中出现Oracle报错performing DML/DDL operation over object in bin,分析原因为回收站中的对象执行了d ...

  5. pandas的read_excel 报错:OverflowError: date value out of range‘ 的解决办法

    pandas的read_excel 报错:OverflowError: date value out of range' 的解决办法 首先,报错了要进行原因分析,导致这个错误的原因是将文件中的数值读成 ...

  6. 搭建网站服务器时报错url,服务器网站总报错“ERROR the requested URL could not be retrieved”解决办法...

    服务器网站总报错"ERROR the requested URL could not be retrieved" ERROR The requested URL could not ...

  7. Unity在OpenGL模式下Shader编译报错

    报错信息 GLSL compilation failed: 0(21) : error C7528: OpenGL reserves names containing '__' 双击报错VS自动打开V ...

  8. R语言加载xlsx报错错误: JAVA_HOME cannot be determined from the Registry解决方法

    错误: package or namespace load failed for 'xlsx':  loadNamespace()里算'rJava'时.onLoad失败了,详细内容:   调用: fu ...

  9. windwos下启动Redis或者Sentinel报错【listen:Unknown error】的原因及解决办法

    笔者一般在linux下使用redis,因为redis团队并没有编写windows版本的redis,所以windows版本的redis维护和更新没有linux版本的及时.最近有同事在windows下使用 ...

最新文章

  1. QUIC学习笔记之 如何做到0RTT加密传输
  2. redis服务器防止入侵,加ip,密码限制
  3. ipywidgets_未来价值和Ipywidgets
  4. IOS 文本文字下面添加下划线
  5. 共建数据库软件全生态,新数科技宣布 ShinSight 开放共享!
  6. [运维笔记] PowerShell (模块).模块的查找、安装、卸载、更新、保存、发布
  7. Windows核心编程_HOOk SOCKET实现封包拦截
  8. PC-hosts 的使用 [可使电脑无法正常上网]
  9. Idea导出项目jar包
  10. 100套精品PPT模板免费拿!以后再也不用怕老板叫你制作PPT了
  11. storm风暴英雄 tempo_《绝地求生》Tempo Storm北美黑马,掌控战场
  12. R Shiny module学习笔记
  13. 中央民族大学计算机考研2020,2020年中央民族大学856计算机学科专业综合考研复习资料...
  14. 手机服务器异常修复,手机服务器异常
  15. 强训之【走方格的方案数和另类加法】
  16. 清华大学五道口金融学院2023年博士生招生简章(普博+直博)
  17. play_win7截图工具
  18. 计数问题:1~n中x出现了多少次?
  19. 智慧建造安全质量管理数字化解决方案
  20. Minecraft神奇玩家不用键盘,行走全部靠骑猪最后通关MC

热门文章

  1. 想用c做能存档的小游戏?来学习文件操作吧。
  2. 倒计时2天 校园行|AI TIME “未来杯” 走进浙江大学
  3. 易验APP一键登录对接文档
  4. D - 折线分割平面 Time Limit:1000MS Memory Limit:32768KB 64bit IO Format:%I64d %I64u
  5. c3p0和dbcp的使用和区别
  6. Appium常用操作及H5页面元素定位
  7. 32导联 博睿康_赛题详情(Competition Details)-运动想象无训练数据集
  8. oracle表空间undotbs1,解决Oracle 表空间UNDOTBS1太大的有关问题
  9. Java 模拟栈结构
  10. python语言之父 是谁_Python之父与Python发展简史