赞
踩
读取 txt、doc、docx 三种格式文档的文本内容。不同格式采用不同的读取逻辑,以避免文本内容出现乱码。
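这里需要在 Maven 项目的 pom.xml 中添加以下 Apache POI 依赖:
三个依赖的版本最好保持一致。另外,下文读取 txt 时用到的 UniversalDetector 来自 juniversalchardet,需要另行引入对应依赖。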
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>5.2.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>5.2.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>5.2.3</version>
- </dependency>
- /**
-  * Extracts the plain text of an uploaded file, choosing the reader by file extension.
-  *
-  * <p>{@code .txt} is handled by {@code readTextFile}, {@code .doc} by {@code readDocFile}
-  * and {@code .docx} by {@code readDocxFile}. The extension is compared case-insensitively,
-  * so {@code REPORT.DOCX} is treated the same way as {@code report.docx}.
-  *
-  * @param file the uploaded file; its original filename decides which reader is used
-  * @return the extracted text, or an empty string when the filename is missing or its
-  *         extension is not one of the three supported ones
-  * @throws IOException if reading the upload or one of the readers fails with an I/O error
-  */
- private String getContent(MultipartFile file) throws IOException {
-     String fileName = file.getOriginalFilename();
-     if (fileName == null) {
-         return "";
-     }
-     // Locale.ROOT keeps the lowercasing independent of the server's default locale.
-     String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
-     if (lowerName.endsWith(".txt")) {
-         return readTextFile(file.getBytes());
-     }
-     if (lowerName.endsWith(".doc")) {
-         return readDocFile(file);
-     }
-     if (lowerName.endsWith(".docx")) {
-         return readDocxFile(file);
-     }
-     return "";
- }
-
-
- /**
-  * Charsets tried, in this order, by {@code readTextFile} when the detected encoding is
-  * missing or its decoded text is rejected.
-  *
-  * <p>The list is wrapped as unmodifiable so the shared constant cannot be altered after
-  * class initialization. {@code Charset.forName} throws if the running JVM does not provide
-  * GBK or GB2312, which would make class initialization fail.
-  */
- private static final List<Charset> FALLBACK_ENCODINGS =
-         java.util.Collections.unmodifiableList(Arrays.asList(
-                 StandardCharsets.UTF_8,
-                 Charset.forName("GBK"),
-                 Charset.forName("GB2312"),
-                 StandardCharsets.ISO_8859_1));
-
-
- /**
-  * Decodes the bytes of a plain-text file into a string, guessing the character encoding.
-  *
-  * <p>The encoding reported by {@code UniversalDetector} is tried first. If none is reported,
-  * the JVM does not support the reported name, or {@code isValidContent} rejects the decoded
-  * text, each charset in {@code FALLBACK_ENCODINGS} is tried in order. When no candidate is
-  * accepted, the bytes are decoded as UTF-8.
-  *
-  * @param fileBytes the raw file content; {@code null} or an empty array yields an empty string
-  * @return the decoded text
-  */
- private String readTextFile(byte[] fileBytes) {
-     if (fileBytes == null || fileBytes.length == 0) {
-         return "";
-     }
-     // Feed the whole buffer to UniversalDetector in a single pass.
-     UniversalDetector detector = new UniversalDetector(null);
-     detector.handleData(fileBytes, 0, fileBytes.length);
-     detector.dataEnd();
-     String encoding = detector.getDetectedCharset();
-     Charset detectedCharset = null;
-     if (encoding != null) {
-         try {
-             detectedCharset = Charset.forName(encoding);
-         } catch (IllegalArgumentException ignored) {
-             // Charset.forName rejects names this JVM does not know: UnsupportedCharsetException
-             // and IllegalCharsetNameException both extend IllegalArgumentException.
-             // Continue with the fallback list instead of failing the whole read.
-         }
-     }
-     if (detectedCharset != null) {
-         String content = new String(fileBytes, detectedCharset);
-         if (isValidContent(content)) {
-             return content;
-         }
-     }
-     // Try the common encodings one by one until a decoding is accepted.
-     for (Charset charset : FALLBACK_ENCODINGS) {
-         String content = new String(fileBytes, charset);
-         if (isValidContent(content)) {
-             return content;
-         }
-     }
-     // Nothing was accepted: return the UTF-8 decoding as the default.
-     return new String(fileBytes, StandardCharsets.UTF_8);
- }
-
-
- /**
-  * Extracts the text of an uploaded {@code .doc} file.
-  *
-  * @param file the uploaded file whose input stream is parsed as an {@code HWPFDocument}
-  * @return the text returned by {@code WordExtractor.getText()}
-  * @throws IOException if opening the stream or reading the document fails
-  */
- private static String readDocFile(MultipartFile file) throws IOException {
-     try (InputStream in = file.getInputStream();
-             HWPFDocument wordDocument = new HWPFDocument(in)) {
-         // The document and the stream are closed by try-with-resources after the text is read.
-         return new WordExtractor(wordDocument).getText();
-     }
- }
-
-
-
-
- /**
-  * Extracts the text of an uploaded {@code .docx} file.
-  *
-  * @param file the uploaded file whose input stream is parsed as an {@code XWPFDocument}
-  * @return the text returned by {@code XWPFWordExtractor.getText()}
-  * @throws IOException if opening the stream or reading the document fails
-  */
- private String readDocxFile(MultipartFile file) throws IOException {
-     // try-with-resources closes the document and then the stream, even when parsing or
-     // text extraction throws.
-     try (InputStream inputStream = file.getInputStream();
-             XWPFDocument docx = new XWPFDocument(inputStream)) {
-         XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
-         return extractor.getText();
-     }
- }
-
-
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。