方案總結:
1:poi(html屬性支援):存在一個bug,當table的同一個cell中既有文字又有圖片時,轉換後圖片會丟失
2:tika(主要是提取內容,轉換出來的html不太好)
3.openoffice(依賴安裝,轉出的html不太好)
4. aspose(功能強大但是付費),但也可以免費使用,缺點:不支援擴充套件,原因是程式碼不是開源的
5.mammoth(對比poi缺少標籤的屬性,比如顏色、字型),不存在同一個cell中既有文字又有圖片時轉換後圖片丟失的問題,樣式方面支援擴充套件,樣例很多
1.maven依賴
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.14</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-examples</artifactId> <version>3.14</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-excelant</artifactId> <version>3.14</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.14</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.14</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.3</version> </dependency> <dependency> <groupId>com.aspose</groupId> <artifactId>aspose-words</artifactId> <version>18.6</version> <scope>system</scope> <systemPath>${project.basedir}/lib/aspose-words-18.6-jdk16.jar</systemPath> </dependency> <dependency> <groupId>com.aspose</groupId> <artifactId>aspose-cells</artifactId> <version>8.5.2</version> <scope>system</scope> <systemPath>${project.basedir}/lib/aspose-cells-8.5.2.jar</systemPath> </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.9</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>xdocreport</artifactId> <version>1.0.6</version> </dependency> <dependency> <groupId>org.apache.xmlbeans</groupId> <artifactId>xmlbeans</artifactId> <version>2.6.0</version> </dependency> <dependency> <groupId>net.sf.cssbox</groupId> <artifactId>pdf2dom</artifactId> <version>1.8</version> </dependency> <dependency> <groupId>org.projectlombok</groupId> <artifactId>lombok</artifactId> <version>1.16.10</version> </dependency> <!-- Hutool工具類 --> <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all --> <dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.3.8</version> </dependency>
2.程式碼實現
2.1 wordBytes2HtmlFile方法
public static File wordBytes2HtmlFile(byte[] wordBytes, String htmlFilePath) { try { log.info("實現`aspose-words`授權 -> 去掉頭部水印"); /* 實現匹配檔案授權 -> 去掉頭部水印 `Evaluation Only. Created with Aspose.Words. Copyright 2003-2018 Aspose Pty Ltd.` | `Evaluation Only. Created with Aspose.Cells for Java. Copyright 2003 - 2020 Aspose Pty Ltd.` */ // InputStream is = new ClassPathResource("license.xml").getInputStream(); // 從當前類載入器中載入資源 InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream("license.xml"); if (is != null) { License license = new License(); license.setLicense(is); } } catch (Exception e) { log.error("《`aspose-words`授權》 失敗: {}", e.getMessage()); } // Load word document from disk. com.aspose.words.Document doc = new com.aspose.words.Document(new ByteArrayInputStream(wordBytes)); // Save the document into MHTML. doc.save(htmlFilePath, SaveFormat.HTML); return new File(htmlFilePath); }
2.2 readBytes方法
/**
 * Reads the whole file at {@code filePath} into a byte array.
 *
 * <p>Delegates directly to {@code FileUtil.readBytes(String)}; path resolution
 * and error behavior are whatever that helper provides.
 *
 * @param filePath path of the file to read
 * @return the file's content as bytes
 */
public static byte[] readBytes(String filePath) {
    final byte[] content = FileUtil.readBytes(filePath);
    return content;
}
2.3 main方法
public static void main(String[] args) { // word2HtmlFile("D:\\doc","JKLJLJLGJ.docx","JKLJLJLGJ.1111.html"); File htmlFile = wordBytes2HtmlFile(readBytes("D:\\doc\\xxxx.docx"), "D:\\doc\\xxxxxxxaaaa.html"); }