docx轉html方案驗證-支援latex,表格,圖片(aspose)

PyJava老鸟發表於2024-07-08

方案總結:

1. poi(支援html屬性):存在一個bug,當table的cell中同時包含文字和圖片時,轉換後圖片會丟失

2. tika(主要用於提取內容,轉換出來的html效果不太好)

3.openoffice(依賴安裝,轉出的html不太好)

4. aspose(功能強大但是付費),但也可以免費使用;缺點:程式碼不是開源的,因此不支援自行擴充套件

5. mammoth(對比poi,缺少標籤的屬性,比如顏色、字型):不存在「cell中同時有文字和圖片時,轉換後圖片丟失」的問題;樣式方面支援擴充套件,而且樣例很多

1.maven依賴

    <!-- Apache POI modules, all pinned to version 3.14. -->
    <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-examples</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-excelant</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>ooxml-schemas</artifactId>
            <version>1.3</version>
        </dependency>
        <!-- Aspose.Words 18.6: system scope, so Maven takes the jar from the
             project's local lib/ directory instead of a repository. The jar
             must be present at the systemPath below for the build to work. -->
        <dependency>
            <groupId>com.aspose</groupId>
            <artifactId>aspose-words</artifactId>
            <version>18.6</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/lib/aspose-words-18.6-jdk16.jar</systemPath>
        </dependency>
        <!-- Aspose.Cells 8.5.2: also system scope, taken from the local lib/ directory. -->
        <dependency>
            <groupId>com.aspose</groupId>
            <artifactId>aspose-cells</artifactId>
            <version>8.5.2</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/lib/aspose-cells-8.5.2.jar</systemPath>
        </dependency>
        <!-- NOTE(review): pdfbox, xdocreport, xmlbeans and pdf2dom are not referenced
             by the code shown in this article; presumably used by the other
             conversion approaches. Confirm before removing. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>xdocreport</artifactId>
            <version>1.0.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.xmlbeans</groupId>
            <artifactId>xmlbeans</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
            <version>1.8</version>
        </dependency>
        <!-- Lombok; presumably the source of the `log` field used by the Java code. TODO confirm. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.10</version>
        </dependency>

        <!-- Hutool utility classes -->
        <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.3.8</version>
        </dependency>

2.程式碼實現

2.1 wordBytes2HtmlFile方法

 /**
  * Converts an in-memory Word document into an HTML file using Aspose.Words.
  *
  * <p>Before converting, the method looks up {@code license.xml} through the current
  * thread's context class loader and, when found, applies it as the Aspose license.
  * Licensing is best-effort: a missing license file is skipped and a failure while
  * applying it is logged, and the conversion is attempted either way.
  *
  * @param wordBytes    raw bytes of the Word document; must not be {@code null}
  * @param htmlFilePath path the HTML output is written to; must not be {@code null}
  * @return a {@link File} pointing at {@code htmlFilePath}
  * @throws NullPointerException  if either argument is {@code null}
  * @throws IllegalStateException if loading or saving the document fails with a
  *                               checked exception (the original cause is preserved)
  */
 public static File wordBytes2HtmlFile(byte[] wordBytes, String htmlFilePath) {
        java.util.Objects.requireNonNull(wordBytes, "wordBytes");
        java.util.Objects.requireNonNull(htmlFilePath, "htmlFilePath");

        log.info("實現`aspose-words`授權 -> 去掉頭部水印");
        /*
          Applying a matching license file is meant to remove the evaluation watermark header, e.g.
          `Evaluation Only. Created with Aspose.Words. Copyright 2003-2018 Aspose Pty Ltd.` |
          `Evaluation Only. Created with Aspose.Cells for Java. Copyright 2003 - 2020 Aspose Pty Ltd.`
         */
        // Load the license from the context class loader. try-with-resources closes the
        // stream once the license has been read; a null resource (file not on the
        // classpath) is permitted and simply skipped on close.
        try (InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream("license.xml")) {
            if (is != null) {
                License license = new License();
                license.setLicense(is);
            }
        } catch (Exception e) {
            // Best-effort: keep going without a license, but keep the stack trace in the log.
            log.error("《`aspose-words`授權》 失敗: {}", e.getMessage(), e);
        }

        try {
            // Load the Word document from the supplied bytes.
            com.aspose.words.Document doc = new com.aspose.words.Document(new ByteArrayInputStream(wordBytes));
            // Save the document as HTML at the requested path.
            doc.save(htmlFilePath, SaveFormat.HTML);
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            // Aspose declares a checked Exception on both calls; this method has no
            // throws clause, so wrap it unchecked while keeping the cause.
            throw new IllegalStateException("Failed to convert Word document to HTML: " + htmlFilePath, e);
        }
        return new File(htmlFilePath);
    }

2.2 readBytes方法

    /**
     * Reads the whole content of a file into a byte array.
     *
     * <p>Delegates directly to {@code FileUtil.readBytes}; any failure behaviour is
     * whatever that helper defines.
     *
     * @param filePath path of the file to read
     * @return the bytes returned by {@code FileUtil.readBytes} for {@code filePath}
     */
    public static byte[] readBytes(String filePath) {
        final byte[] content = FileUtil.readBytes(filePath);
        return content;
    }

2.3 main方法

    /**
     * Demo entry point: converts a single .docx file to HTML via
     * {@code wordBytes2HtmlFile}.
     *
     * @param args optional overrides: {@code args[0]} is the source .docx path and
     *             {@code args[1]} the target .html path; when absent, the sample
     *             paths under {@code D:\doc} are used
     */
    public static void main(String[] args) {
        // Fall back to the sample paths so running without arguments behaves as before.
        String docxPath = (args != null && args.length > 0) ? args[0] : "D:\\doc\\xxxx.docx";
        String htmlPath = (args != null && args.length > 1) ? args[1] : "D:\\doc\\xxxxxxxaaaa.html";

        wordBytes2HtmlFile(readBytes(docxPath), htmlPath);
    }

相關文章