当前位置:   article > 正文

Excel大文件时读取内存溢出的解决方案,测试50M的Excel通过。_xssfreader.getsharedstringtable内存溢出

xssfreader.getsharedstringtable内存溢出

1.大文件xlsx的文件解析,本案例仅仅只是读取Excel文件的内容为例。具体需求可按自己的需求改。

  1. package com.sundy.parse.util;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.util.ArrayList;
  6. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  7. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  8. import org.apache.poi.openxml4j.opc.OPCPackage;
  9. import org.apache.poi.openxml4j.opc.PackageAccess;
  10. import org.apache.poi.xssf.eventusermodel.XSSFReader;
  11. import org.apache.poi.xssf.model.SharedStringsTable;
  12. import org.apache.poi.xssf.usermodel.XSSFRichTextString;
  13. import org.xml.sax.Attributes;
  14. import org.xml.sax.InputSource;
  15. import org.xml.sax.SAXException;
  16. import org.xml.sax.XMLReader;
  17. import org.xml.sax.helpers.DefaultHandler;
  18. import org.xml.sax.helpers.XMLReaderFactory;
  19. public class ParseXlsxExcel extends DefaultHandler {
  20. private OPCPackage opcPackage = null;
  21. private XSSFReader xssfReader = null;
  22. private boolean nextIsStr;
  23. private String cellContent; //单元格的内容
  24. private ArrayList<String> sheetList; // sheet的所有内容
  25. private ArrayList<String> excelList; // Excel的所有内容
  26. private SharedStringsTable sst;
  27. private int currentColumn = 0;
  28. private XMLReader reader;
  29. /**
  30. * @param path .xlxs后缀的Excel文件
  31. * @param sheetId
  32. * @throws IOException
  33. * @throws OpenXML4JException
  34. * @throws SAXException
  35. */
  36. public ParseXlsxExcel(String path,int sheetId) throws IOException, OpenXML4JException, SAXException{
  37. init(new File(path));
  38. parseSheet(sheetId);
  39. }
  40. public ParseXlsxExcel(File file,int sheetId) throws IOException, OpenXML4JException, SAXException{
  41. init(file);
  42. parseSheet(sheetId);
  43. }
  44. /**
  45. * @param file .xlxs 后缀的Excel文件
  46. * @param flag true 表示查询所有 ,false 表示只获取sheetId为1的内容
  47. * @throws IOException
  48. * @throws OpenXML4JException
  49. * @throws SAXException
  50. */
  51. public ParseXlsxExcel(File file,boolean flag) throws IOException, OpenXML4JException, SAXException{
  52. initAll(file, flag);
  53. }
  54. public ParseXlsxExcel(String path,boolean flag) throws IOException, OpenXML4JException, SAXException{
  55. initAll(new File(path), flag);
  56. }
  57. private void initAll(File file, boolean flag)throws InvalidFormatException, IOException, OpenXML4JException,SAXException {
  58. init(file);
  59. if(flag){
  60. parseAllSheet();
  61. }else {
  62. parseSheet(1);
  63. }
  64. }
  65. private void parseAllSheet() {
  66. int x = 0;
  67. int y = 0;
  68. while(x==y){
  69. try {
  70. parseSheet(++x);
  71. excelList.addAll(sheetList);
  72. y=x;
  73. currentColumn=0;
  74. } catch (Exception e) {
  75. // e.printStackTrace();
  76. }
  77. }
  78. }
  79. private void init(File file) throws IOException, OpenXML4JException, SAXException{
  80. opcPackage = OPCPackage.open((file), PackageAccess.READ);
  81. sheetList = new ArrayList<String>();
  82. excelList = new ArrayList<String>();
  83. xssfReader = new XSSFReader(opcPackage);
  84. sst = xssfReader.getSharedStringsTable();
  85. reader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
  86. reader.setContentHandler(this);
  87. }
  88. public void close() throws IOException{
  89. if(opcPackage!=null) opcPackage.close();
  90. }
  91. private void parseSheet(int sheetId) throws IOException, OpenXML4JException, SAXException{
  92. sheetList.clear();
  93. InputStream inStream = xssfReader.getSheet("rId"+sheetId);
  94. InputSource inputSource = new InputSource(inStream);
  95. reader.parse(inputSource);
  96. }
  97. @Override
  98. public void startElement(String uri, String localName, String qName,Attributes attributes) throws SAXException {
  99. if(qName.equals("c")){ // c为单元格,v为值,row为行。
  100. String type = attributes.getValue("t");
  101. if(type!=null && type.equals("s")){
  102. nextIsStr = true;
  103. }else {
  104. nextIsStr = false;
  105. }
  106. }
  107. }
  108. @Override
  109. public void characters(char[] ch, int start, int length) throws SAXException {
  110. cellContent = new String(ch, start, length);
  111. }
  112. @Override
  113. public void endElement(String uri, String localName, String qName) throws SAXException{
  114. if(nextIsStr&&cellContent!=""&&cellContent!=null){
  115. try {
  116. int idx = Integer.parseInt(cellContent);
  117. cellContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
  118. } catch (Exception e) {
  119. // e.printStackTrace();
  120. }
  121. }
  122. if(qName.equals("v")){
  123. sheetList.add(currentColumn++, cellContent);
  124. cellContent="";
  125. }
  126. }
  127. /** sheet的所有内容 */
  128. public ArrayList<String> getSheetList() {
  129. return sheetList;
  130. }
  131. /** Excel的所有内容 */
  132. public ArrayList<String> getExcelList() {
  133. return excelList;
  134. }
  135. public static void main(String[] args) throws Exception{
  136. long start = System.currentTimeMillis();
  137. ParseXlsxExcel excel = new ParseXlsxExcel("C:/Users/admin/Desktop/大师傅家的感觉.xlsx",true);
  138. excel.close();
  139. ArrayList<String> list = excel.getExcelList();
  140. long end = System.currentTimeMillis();
  141. System.out.println(list.size());//19677984
  142. System.out.println(end-start); //16389ms
  143. }
  144. }

2.xls大文件的解析,参考如下(http://gaosheng08.iteye.com/blog/624604);

  1. package com.sundy.parse.util;
  2. import java.io.FileInputStream;
  3. import java.io.IOException;
  4. import java.util.ArrayList;
  5. import java.util.List;
  6. import java.util.regex.Pattern;
  7. import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
  8. import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
  9. import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
  10. import org.apache.poi.hssf.eventusermodel.HSSFListener;
  11. import org.apache.poi.hssf.eventusermodel.HSSFRequest;
  12. import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
  13. import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
  14. import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
  15. import org.apache.poi.hssf.model.HSSFFormulaParser;
  16. import org.apache.poi.hssf.record.BOFRecord;
  17. import org.apache.poi.hssf.record.BlankRecord;
  18. import org.apache.poi.hssf.record.BoolErrRecord;
  19. import org.apache.poi.hssf.record.BoundSheetRecord;
  20. import org.apache.poi.hssf.record.FormulaRecord;
  21. import org.apache.poi.hssf.record.LabelRecord;
  22. import org.apache.poi.hssf.record.LabelSSTRecord;
  23. import org.apache.poi.hssf.record.NoteRecord;
  24. import org.apache.poi.hssf.record.NumberRecord;
  25. import org.apache.poi.hssf.record.RKRecord;
  26. import org.apache.poi.hssf.record.Record;
  27. import org.apache.poi.hssf.record.SSTRecord;
  28. import org.apache.poi.hssf.record.StringRecord;
  29. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  30. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  31. /**
  32. * 用于解析大于5M的后缀为.xls的Excel文件,防止内存溢出
  33. * @author sundy
  34. * 参考:http://gaosheng08.iteye.com/blog/624604
  35. */
  36. public class ParseXlsExcel implements HSSFListener{
  37. private int minColumns;
  38. private POIFSFileSystem fs;
  39. private int lastRowNumber;
  40. private int lastColumnNumber;
  41. /** Should we output the formula, or the value it has? */
  42. private boolean outputFormulaValues = true;
  43. /** For parsing Formulas */
  44. private SheetRecordCollectingListener workbookBuildingListener;
  45. private HSSFWorkbook stubWorkbook;
  46. // Records we pick up as we process
  47. private SSTRecord sstRecord;
  48. private FormatTrackingHSSFListener formatListener;
  49. /** So we known which sheet we're on */
  50. private int sheetIndex = -1;
  51. private BoundSheetRecord[] orderedBSRs;
  52. @SuppressWarnings("unchecked")
  53. private ArrayList boundSheetRecords = new ArrayList();
  54. // For handling formulas with string results
  55. private int nextRow;
  56. private int nextColumn;
  57. private boolean outputNextStringRecord;
  58. private int curRow;
  59. private List<String> rowlist;
  60. @SuppressWarnings( "unused")
  61. private String sheetName;
  62. private List<String> contents = new ArrayList<String>();
  63. public ParseXlsExcel(POIFSFileSystem fs) throws Exception {
  64. this.fs = fs;
  65. this.minColumns = -1;
  66. this.curRow = 0;
  67. this.rowlist = new ArrayList<String>();
  68. }
  69. public ParseXlsExcel(String filename) throws Exception {
  70. this(new POIFSFileSystem(new FileInputStream(filename)));
  71. }
  72. //excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型
  73. public void optRows(int sheetIndex,int curRow, List<String> rowlist){
  74. StringBuffer sb = new StringBuffer();
  75. for (int i = 0 ;i< rowlist.size();i++){
  76. String str = rowlist.get(i);
  77. if(null!=str&&!"".equals(str)) sb.append(str.trim()+" ");
  78. }
  79. contents.add(sb.toString());
  80. }
  81. /**读取到的Excel文档,返回文档的content*/
  82. public List<String> getList() {
  83. return contents;
  84. }
  85. /**
  86. * 遍历 excel 文件
  87. */
  88. public void process() throws IOException {
  89. MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
  90. formatListener = new FormatTrackingHSSFListener(listener);
  91. HSSFEventFactory factory = new HSSFEventFactory();
  92. HSSFRequest request = new HSSFRequest();
  93. if (outputFormulaValues) {
  94. request.addListenerForAllRecords(formatListener);
  95. } else {
  96. workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
  97. request.addListenerForAllRecords(workbookBuildingListener);
  98. }
  99. factory.processWorkbookEvents(request, fs);
  100. }
  101. /**
  102. * HSSFListener 监听方法,处理 Record
  103. */
  104. @SuppressWarnings("unchecked")
  105. public void processRecord(Record record) {
  106. int thisRow = -1;
  107. int thisColumn = -1;
  108. String thisStr = null;
  109. String value = null;
  110. switch (record.getSid()) {
  111. case BoundSheetRecord.sid:
  112. boundSheetRecords.add(record);
  113. break;
  114. case BOFRecord.sid:
  115. BOFRecord br = (BOFRecord) record;
  116. if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
  117. // Create sub workbook if required
  118. if (workbookBuildingListener != null && stubWorkbook == null) {
  119. stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
  120. }
  121. sheetIndex++;
  122. if (orderedBSRs == null) {
  123. orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
  124. }
  125. sheetName = orderedBSRs[sheetIndex].getSheetname();
  126. }
  127. break;
  128. case SSTRecord.sid:
  129. sstRecord = (SSTRecord) record;
  130. break;
  131. case BlankRecord.sid:
  132. BlankRecord brec = (BlankRecord) record;
  133. thisRow = brec.getRow();
  134. thisColumn = brec.getColumn();
  135. thisStr = "";
  136. break;
  137. case BoolErrRecord.sid:
  138. BoolErrRecord berec = (BoolErrRecord) record;
  139. thisRow = berec.getRow();
  140. thisColumn = berec.getColumn();
  141. thisStr = "";
  142. break;
  143. case FormulaRecord.sid:
  144. FormulaRecord frec = (FormulaRecord) record;
  145. thisRow = frec.getRow();
  146. thisColumn = frec.getColumn();
  147. if (outputFormulaValues) {
  148. if (Double.isNaN(frec.getValue())) {
  149. outputNextStringRecord = true;
  150. nextRow = frec.getRow();
  151. nextColumn = frec.getColumn();
  152. } else {
  153. thisStr = formatListener.formatNumberDateCell(frec);
  154. }
  155. } else {
  156. thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
  157. }
  158. break;
  159. case StringRecord.sid:
  160. if (outputNextStringRecord) {
  161. // String for formula
  162. StringRecord srec = (StringRecord) record;
  163. thisStr = srec.getString();
  164. thisRow = nextRow;
  165. thisColumn = nextColumn;
  166. outputNextStringRecord = false;
  167. }
  168. break;
  169. case LabelRecord.sid:
  170. LabelRecord lrec = (LabelRecord) record;
  171. curRow = thisRow = lrec.getRow();
  172. thisColumn = lrec.getColumn();
  173. value = lrec.getValue().trim();
  174. value = value.equals("")?"":value;
  175. // this.rowlist.add(thisColumn, value);
  176. rowlist.add(value);
  177. break;
  178. case LabelSSTRecord.sid:
  179. LabelSSTRecord lsrec = (LabelSSTRecord) record;
  180. curRow = thisRow = lsrec.getRow();
  181. thisColumn = lsrec.getColumn();
  182. if (sstRecord != null){
  183. value = sstRecord
  184. .getString(lsrec.getSSTIndex()).toString().trim();
  185. value = value.equals("")?"":value;
  186. rowlist.add(value);
  187. }
  188. break;
  189. case NoteRecord.sid:
  190. NoteRecord nrec = (NoteRecord) record;
  191. thisRow = nrec.getRow();
  192. thisColumn = nrec.getColumn();
  193. thisStr = '"' + "(TODO)" + '"';
  194. break;
  195. case NumberRecord.sid:
  196. NumberRecord numrec = (NumberRecord) record;
  197. curRow = thisRow = numrec.getRow();
  198. thisColumn = numrec.getColumn();
  199. value = formatListener.formatNumberDateCell(numrec).trim();
  200. value = value.equals("")?"":value;
  201. rowlist.add(value);
  202. break;
  203. case RKRecord.sid:
  204. RKRecord rkrec = (RKRecord) record;
  205. thisRow = rkrec.getRow();
  206. thisColumn = rkrec.getColumn();
  207. thisStr = '"' + "(TODO)" + '"';
  208. break;
  209. default:
  210. break;
  211. }
  212. // 遇到新行的操作
  213. if (thisRow != -1 && thisRow != lastRowNumber) {
  214. lastColumnNumber = -1;
  215. }
  216. // 空值的操作
  217. if (record instanceof MissingCellDummyRecord) {
  218. MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
  219. curRow = thisRow = mc.getRow();
  220. thisColumn = mc.getColumn();
  221. // rowlist.add(thisColumn," ");
  222. }
  223. // 更新行和列的值
  224. if (thisRow > -1)
  225. lastRowNumber = thisRow;
  226. if (thisColumn > -1)
  227. lastColumnNumber = thisColumn;
  228. // 行结束时的操作
  229. if (record instanceof LastCellOfRowDummyRecord) {
  230. if (minColumns > 0) {
  231. // 列值重新置空
  232. if (lastColumnNumber == -1) {
  233. lastColumnNumber = 0;
  234. }
  235. }
  236. // 行结束时, 调用 optRows() 方法
  237. lastColumnNumber = -1;
  238. try {
  239. optRows(sheetIndex,curRow, rowlist);
  240. } catch (Exception e) {
  241. e.printStackTrace();
  242. }
  243. rowlist.clear();
  244. }
  245. }
  246. public static void main(String[] args) throws Exception {
  247. ParseXlsExcel excel = new ParseXlsExcel("F:\\srv\\tiptop\\crawl\\www.anxiang.gov.cn\\Excel\\3fbc7e659d1f42311ee89c6d47fac414.xls");
  248. excel.process();
  249. int size = excel.getList().size();
  250. List<String> list2 = excel.getList();
  251. String reg = "楚源收储点";
  252. Pattern pattern = Pattern.compile(reg);
  253. int x = 0;
  254. for (int i = 0; i < size; i++) {
  255. String replace = list2.get(i).trim().replace("","");
  256. if(null!=replace&&!"".equals(replace)){
  257. if(pattern.matcher(replace).find()) x+=1;
  258. System.out.println(replace);
  259. }
  260. }
  261. System.err.println(" 存在关键字【"+reg+"】的个数: "+x);
  262. }
  263. }


声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/88890
推荐阅读
相关标签
  

闽ICP备14008679号