赞
踩
因为自己的任务需要用到 Word 转 HTML,但 POI 3.1.2 版本与我使用的 POI 4.1.2 版本冲突,所以尝试基于 4.1.2 版本实现 Word 转 HTML。这套方案可以同时支持 doc 和 docx 两种格式,非常好用。本文主要介绍 docx 转 HTML;doc 的转换相对简单,文末也给出了示例,有兴趣的可以尝试一下。
- <poi.version>4.1.2</poi.version>
- <!--注意版本保持一致 poi poi-ooxml poi-scratchpad-->
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <!-- 操作doc ppt xls -->
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>${poi.version}</version>
- </dependency>
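- <!-- 操作docx pptx xlsx -->
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>${poi.version}</version>
- </dependency>
-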
- <!--word S-->
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>ooxml-schemas</artifactId>
- <version>1.4</version>
- </dependency>
-
- <dependency>
- <groupId>fr.opensagres.xdocreport</groupId>
- <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
- <version>2.0.2</version>
- </dependency>
- <dependency>
- <groupId>fr.opensagres.xdocreport</groupId>
- <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
- <version>2.0.1</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
- <dependency>
- <groupId>org.jsoup</groupId>
- <artifactId>jsoup</artifactId>
- <version>1.17.2</version>
- </dependency>
使用Apache POI库读取Word文档。对于.docx文件,使用XWPFDocument类;对于.doc文件,使用HWPFDocument类。
Word 2007 及以上版本(.docx)转 HTML
- public static String Word2007ToHtml(MultipartFile file) throws IOException{
- if (file.isEmpty() || file.getSize() <= 0) {
- throw new RuntimeException("文件为空,请添加文件");
- }else{
- if (file.getOriginalFilename().endsWith(".docx") || file.getOriginalFilename().endsWith(".DOCX")){
- try (InputStream input = file.getInputStream()) {
- XWPFDocument wordDocument = new XWPFDocument(input);
- XHTMLOptions options = XHTMLOptions.create();
- // 图片转base64
- //options.setImageManager(new Base64EmbedImgManager());
- // 获取所有图片数据
- options.setImageManager(new CustomImageManager(staticmediaUploadApi));
- options.setFragment(true);
- //忽略页眉页脚
- options.setOmitHeaderFooterPages(true);
-
- options.setIgnoreStylesIfUnused(false);
- // 转换html
- ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
- XHTMLConverter.getInstance().convert(wordDocument, htmlStream, options);
- String htmlStr = htmlStream.toString();
- htmlStream.close();
- return htmlStr;
- } catch (IOException e) {
- e.printStackTrace();
- return null; // 或者抛出异常,取决于你的业务逻辑
- }
- }
- else{
- throw new RuntimeException("文件格式错误,只能输入 MS Office 2007+ files");
- }
- }
- }
Word 2003(.doc)转 HTML
- /**
- * 将doc格式的文件转换为html格式
- * @param inFileName 输入的doc文件名
- * @param outFileName 输出的html文件名
- */
- public static void docToHtml(String inFileName, String outFileName) {
- String content = null;
- ByteArrayOutputStream baos = null;
- try {
- // 新建word输入流,用于读取doc文件内容
- FileInputStream source = new FileInputStream(new File(inFileName));
- // 获取word对象,用于后续处理
- HWPFDocument wordDocument = new HWPFDocument(source);
- // 创建WordToHtmlConverter对象,用于转换文档
- WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
- // 设置图片存放的位置
- wordToHtmlConverter.setPicturesManager(new PicturesManager() {
- @Override
- public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
- File imgPath = new File("F:\\TestFile\\images");
- if (!imgPath.exists()) { // 如果图片目录不存在则创建
- imgPath.mkdirs();
- }
- File file = new File("F:\\TestFile\\images" + suggestedName);
- try {
- OutputStream os = new FileOutputStream(file);
- os.write(content);
- os.close();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return "F:\\TestFile\\images" + suggestedName;
- }
- });
- // 处理文档转换
- wordToHtmlConverter.processDocument(wordDocument);
- // 获取转换后的html文档对象
- Document htmlDocument = wordToHtmlConverter.getDocument();
- // 创建DOMSource对象,用于后续转换
- DOMSource domSource = new DOMSource(htmlDocument);
-
- // 创建TransformerFactory对象,用于后续转换操作
- TransformerFactory tf = TransformerFactory.newInstance();
- // 创建Transformer对象,用于执行实际的转换操作
- Transformer serializer = tf.newTransformer();
- // 设置输出属性,如编码、缩进等
- serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
- serializer.setOutputProperty(OutputKeys.INDENT, "yes");
- serializer.setOutputProperty(OutputKeys.METHOD, "html");
- // 新建输出流,用于写入转换后的html内容到指定文件
- FileOutputStream fos = new FileOutputStream(new File(outFileName));
- StreamResult streamResult = new StreamResult(fos);
- // 执行转换操作,将domSource转换为streamResult
- serializer.transform(domSource, streamResult);
-
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- try {
- if (baos != null) {
- content = new String(baos.toByteArray(), "utf-8");
- baos.close();
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 自定义的图片管理器,继承自ImageManager,用于处理Word文档中的图片。
- */
- @Slf4j
- public class CustomImageManager extends ImageManager {
-
- private MediaUploadApi mediaUploadApi;
- private byte[] picture;
- private String MediaId;
-
- /**
- * 构造函数,初始化基础目录和图片子目录。
- *
- */
- public CustomImageManager(MediaUploadApi staticmediaUploadApi) {
- super(new File(""), ""); // 调用父类构造函数
- this.mediaUploadApi = staticmediaUploadApi;
- }
-
- /**
- * 重写extract方法,用于从Word文档中提取图片数据。
- *
- * @param imagePath 图片的路径
- * @param imageData 图片的数据
- * @throws IOException 如果读写文件时发生错误
- */
- @Override
- public void extract(String imagePath, byte[] imageData) throws IOException {
-
- this.picture = imageData;// 调用父类的extract方法
- File file = FileUtil.writeBytes(imageData, imagePath);
- MultipartFile multipartFile;
- try {
- DiskFileItem item = (DiskFileItem) new DiskFileItemFactory().createItem("file", "image/png", true, file.getName());
- Files.copy(Paths.get(file.getAbsolutePath()), item.getOutputStream());
- multipartFile = new CommonsMultipartFile(item);
- MediaGetResponse upload = mediaUploadApi.upload(multipartFile, null, null, null, null, null, null, null, null);
- this.MediaId = upload.getMedia().getMediaId();
- log.error(this.MediaId);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- //byte 转化成 file
-
- }
-
- /**
- * 重写resolve方法,用于解析图片的URI。
- *
- * @param uri 图片的URI
- * @return 解析后的图片路径
- */
- @Override
- public String resolve(String uri) {
- // 使用上传至云存储的方法
- // String imageid = mediaUploadApi.upload()
- String imageUrl = "https://api.jizhibao.cn.com/file/download/file/" + this.MediaId;
-
- return imageUrl;
- }
-
- /**
- * 获取图片相对于基本目录的路径。
- *
- * @param imagePath 图片的完整路径
- * @return 相对路径
- */
- private String getImageRelativePath(String imagePath) {
- return imagePath; // 返回图片的原始路径,未做任何处理
- }
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。