docx轉html方案驗證-支援latex,表格,圖片(poi)

PyJava老鸟發表於2024-07-08

方案總結:

1. poi(支援html屬性):存在一個bug,table的cell中既有文字又有圖片時,轉換後圖片會丟失

2:tika(主要是提取內容,轉換出來的html不太好)

3.openoffice(依賴安裝,轉出的html不太好)

4. aspose(功能強大但是付費,也可以免費使用);缺點:程式碼不是開源的,因此不支援擴充套件

5. mammoth(對比poi,缺少顏色、字型等標籤屬性);不存在cell中既有文字又有圖片時轉換後圖片丟失的問題,樣式方面支援擴充套件,樣例很多

本文基於poi實現docx to html

1.maven依賴

<dependency>
      <groupId>docx4j</groupId>
      <artifactId>docx4j</artifactId>
      <version>3.3.7</version>
      <scope>system</scope>
      <systemPath>${project.basedir}/src/main/resources/lib/docx4j-3.3.7.jar</systemPath>
    </dependency>

    <dependency>
      <groupId>org.jdom</groupId>
      <artifactId>jdom</artifactId>
      <version>2.0.6</version>
      <scope>system</scope>
      <systemPath>${project.basedir}/src/main/resources/lib/jdom-2.0.6.jar</systemPath>
    </dependency>
<dependency>
      <groupId>fmath</groupId>
      <artifactId>fmath-mathml-java</artifactId>
      <version>3.1</version>
      <scope>system</scope>
      <systemPath>${project.basedir}/src/main/resources/lib/fmath-mathml-java-3.1.jar</systemPath>
    </dependency>

<dependency>
      <groupId>org.scilab.forge</groupId>
      <artifactId>jlatexmath</artifactId>
      <version>1.0.8-SNAPSHOT</version>
      <scope>system</scope>
      <systemPath>${project.basedir}/src/main/resources/lib/jlatexmath-1.0.8-SNAPSHOT.jar</systemPath>
    </dependency>

<dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.11.0</version>
    </dependency>

    <dependency>
      <groupId>net.sf.saxon</groupId>
      <artifactId>Saxon-HE</artifactId>
      <version>9.8.0-12</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.core -->
    <dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
      <version>1.0.6</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.xhtml -->
    <dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
      <version>1.0.6</version>
    </dependency>




    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>3.17</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-collections4</artifactId>
      <version>4.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.17</version>
    </dependency>
     <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>3.16</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.xmlbeans/xmlbeans -->
    <dependency>
      <groupId>org.apache.xmlbeans</groupId>
      <artifactId>xmlbeans</artifactId>
      <version>2.6.0</version>
    </dependency>

    <!-- NOTE(review): poi-ooxml is pinned to 3.14 here, while poi and poi-scratchpad are
         declared as 3.17 and poi-ooxml-schemas as 3.16 in this same list. Apache POI jars
         are normally kept on one version; confirm that this mix is intentional. -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>3.14</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.4</version>
    </dependency>

2. 程式碼實現

2.1 docxToHtml方法
// docx轉換為html
    /**
     * Converts a {@code .docx} file to an HTML file named after the source document.
     *
     * <p>The conversion runs in three steps:
     * <ol>
     *   <li>every body paragraph and every paragraph inside a top-level table cell is passed
     *       to {@code handle}, which swaps formulas for pictures;</li>
     *   <li>the modified document is written to a temporary {@code .docx} inside an
     *       {@code html} directory next to the source file, and that temporary file is
     *       removed again when the method returns;</li>
     *   <li>the temporary copy is re-read and converted to {@code <outDir>/<name>.html}
     *       (UTF-8), with pictures extracted below {@code <outDir>/image}.</li>
     * </ol>
     *
     * <p>Failures are propagated to the caller instead of being printed and ignored, so a
     * normal return means the HTML file was written.
     *
     * @param sourceFile path of the document to convert; must end with {@code .docx}
     *                   (case-insensitive)
     * @param outDir     directory that receives the HTML file and the {@code image} folder;
     *                   created when missing
     * @throws IllegalArgumentException if {@code sourceFile} is {@code null} or does not end
     *                                  with {@code .docx}
     * @throws Exception                if reading, rewriting or converting the document fails
     */
    public static void docxToHtml(String sourceFile, String outDir) throws Exception {
        final String suffix = ".docx";
        // Check the real extension: a substring test would also accept "a.docx.bak".
        // regionMatches returns false for a negative offset, so short names are rejected too.
        if (sourceFile == null
                || !sourceFile.regionMatches(true, sourceFile.length() - suffix.length(),
                        suffix, 0, suffix.length())) {
            throw new IllegalArgumentException("you should use docx file");
        }

        // Derive directory and base name from the path itself. Splitting the whole path on
        // "." breaks for "./a.docx", "report.v2.docx" or directories containing a dot.
        File source = new File(sourceFile).getAbsoluteFile();
        String fileName = source.getName();
        String name = fileName.substring(0, fileName.length() - suffix.length());

        File tempDir = new File(source.getParentFile(), "html");
        String uuidStr = UUID.randomUUID().toString().replace("-", "").substring(0, 16);
        File tempFile = new File(tempDir, name + "-temp" + uuidStr + ".docx");

        ZipSecureFile.setMinInflateRatio(0.004);
        try {
            // Step 1 + 2: rewrite formulas and persist the modified document.
            try (FileInputStream fis = new FileInputStream(source);
                 XWPFDocument document = new XWPFDocument(fis)) {
                for (XWPFParagraph paragraph : document.getParagraphs()) {
                    handle(paragraph);
                }
                for (XWPFTable table : document.getTables()) {
                    for (XWPFTableRow row : table.getRows()) {
                        for (XWPFTableCell cell : row.getTableCells()) {
                            for (XWPFParagraph paragraph : cell.getParagraphs()) {
                                handle(paragraph);
                            }
                        }
                    }
                }

                ensureDirectory(tempDir);
                try (FileOutputStream tempOut = new FileOutputStream(tempFile)) {
                    document.write(tempOut);
                }
            }

            // Step 3: convert the rewritten copy to XHTML.
            File outDirFile = new File(outDir);
            ensureDirectory(outDirFile);
            File imageDir = new File(outDirFile, "image");
            ensureDirectory(imageDir);

            final XHTMLOptions options = XHTMLOptions.create();
            // Folder that receives the extracted pictures.
            options.setExtractor(new FileImageExtractor(imageDir));
            // Path prefix used for pictures inside the generated HTML.
            options.URIResolver(new BasicURIResolver("image"));
            // Custom content handler factory so tables get explicit borders.
            options.setContentHandlerFactory(new IContentHandlerFactory() {
                @Override
                public ContentHandler create(OutputStream outputStream, Writer writer, XHTMLOptions xhtmlOptions) {
                    return outputStream != null ? new TableAwareContentHandler(outputStream, options.getIndent())
                            : new TableAwareContentHandler(writer, options.getIndent());
                }
            });

            File targetFile = new File(outDirFile, name + ".html");
            try (InputStream tempIn = Files.newInputStream(tempFile.toPath());
                 XWPFDocument documentNew = new XWPFDocument(tempIn);
                 FileOutputStream fileOutputStream = new FileOutputStream(targetFile);
                 OutputStreamWriter outputStreamWriter =
                         new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8)) {
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(documentNew, outputStreamWriter, options);
            }
        } finally {
            // Best-effort cleanup; never mask an exception from the conversion itself.
            if (tempFile.exists() && !tempFile.delete()) {
                tempFile.deleteOnExit();
            }
        }
    }

    /** Creates {@code dir} (including parents) when it does not exist yet. */
    private static void ensureDirectory(File dir) throws IOException {
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("could not create directory: " + dir);
        }
    }

2.2 handle方法

 /**
  * Replaces every Office Math ({@code m:oMath}) element found in the paragraph with a run
  * holding a PNG picture of the formula.
  *
  * <p>For each formula the OMML is converted to MathML and then to LaTeX through
  * {@code MathmlUtils} and {@code LatexUtil} (helpers not shown here), rendered to PNG,
  * and the picture run is moved to the position of the removed formula.
  *
  * @param paragraph paragraph to rewrite in place
  * @throws InvalidFormatException declared for {@code XWPFRun.addPicture}
  * @throws IOException            if the formula picture cannot be encoded or added
  */
 private static void handle(XWPFParagraph paragraph) throws InvalidFormatException, IOException {
        CTP ctp = paragraph.getCTP();
        XmlObject[] xmlObjects = ctp.selectPath(
                "declare namespace m='http://schemas.openxmlformats.org/officeDocument/2006/math' .//m:oMath");
        for (XmlObject xmlObject : xmlObjects) {
            String mml = MathmlUtils.convertOMML2MML(xmlObject.xmlText());
            String latex = MathmlUtils.convertMML2Latex(mml);

            latex = LatexUtil.latexFilter(latex);

            // Render first: if the LaTeX cannot be rendered, the paragraph is still untouched.
            byte[] png = renderLatexToPng(latex);

            XmlCursor cursor = xmlObject.newCursor();
            XmlCursor startCursor = null;
            XmlCursor newCursor = null;
            try {
                cursor.removeXml();
                startCursor = cursor.newCursor();
                startCursor.toPrevToken();

                XWPFRun run = paragraph.createRun();

                // Plain text style for the run; adjust if needed.
                run.setBold(false);
                run.setItalic(false);
                run.setFontSize(14);

                // NOTE(review): the extent is fixed at 200 x 100 regardless of the rendered
                // icon size - confirm whether this scaling is intended.
                try (InputStream isa = new ByteArrayInputStream(png)) {
                    run.addPicture(isa, XWPFDocument.PICTURE_TYPE_PNG, "image.png", Units.toEMU(200), Units.toEMU(100));
                }

                // NOTE(review): the repositioning below is order-sensitive and is kept
                // exactly as it was; only cursor cleanup has been added around it.
                cursor.toPrevToken();
                XmlObject newRunXmlObject = run.getCTR();
                newCursor = newRunXmlObject.newCursor();
                if (cursor.toPrevSibling()) {
                    cursor.toEndToken();
                    newCursor.moveXml(startCursor);
                } else { // Otherwise, insert the new run at the start of where the formula was
                    cursor.toParent();
                    cursor.toNextToken();
                    newCursor.moveXml(cursor);
                }
            } finally {
                // Cursors must be released explicitly once they are no longer needed.
                if (newCursor != null) {
                    newCursor.dispose();
                }
                if (startCursor != null) {
                    startCursor.dispose();
                }
                cursor.dispose();
            }
        }
    }

    /**
     * Renders a LaTeX formula on a white background and returns it encoded as PNG.
     *
     * @param latex formula source
     * @return PNG bytes of the rendered formula
     * @throws IOException if the image cannot be encoded as PNG
     */
    private static byte[] renderLatexToPng(String latex) throws IOException {
        TeXFormula formula = new TeXFormula(latex);
        TeXIcon icon = formula.createTeXIcon(TeXConstants.STYLE_DISPLAY, 20);
        BufferedImage image = new BufferedImage(icon.getIconWidth(), icon.getIconHeight(), BufferedImage.TYPE_INT_ARGB);
        Graphics2D g2 = image.createGraphics();
        try {
            g2.setColor(Color.WHITE);
            g2.fillRect(0, 0, icon.getIconWidth(), icon.getIconHeight());
            JLabel jl = new JLabel();
            jl.setForeground(new Color(0, 0, 0));
            icon.paintIcon(jl, g2, 0, 0);
        } finally {
            // Release the graphics context as soon as painting is done.
            g2.dispose();
        }

        // Output stream that collects the encoded image bytes.
        java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
        // ImageIO.write reports a missing writer through its return value, not an exception.
        if (!ImageIO.write(image, "png", baos)) {
            throw new IOException("no PNG writer available for formula image");
        }
        return baos.toByteArray();
    }

2.3 TableAwareContentHandler

/**
 * SAX content handler that serialises the received events as markup to either an
 * {@link OutputStream} or a {@link Writer}, with table-specific adjustments:
 * <ul>
 *   <li>{@code table}, {@code tr} and {@code td} elements always carry
 *       {@code border: 1px solid black;} in their {@code style} attribute (appended to an
 *       existing {@code style} value instead of emitting the attribute twice);</li>
 *   <li>{@code th} start and end tags are dropped; character data received inside them is
 *       still written.</li>
 * </ul>
 * Character data and attribute values are escaped so document text such as {@code a < b}
 * cannot break the generated markup. Bytes written to an {@code OutputStream} are UTF-8.
 */
static class TableAwareContentHandler extends DefaultHandler {
        /** Inline style forced onto every table, tr and td element. */
        private static final String BORDER_STYLE = "border: 1px solid black;";

        private final OutputStream out;
        private final Writer writer;
        /** True while a start tag has been written but not yet closed with {@code >}. */
        private boolean startingElement;
        /** Text received since the last tag, already escaped where required. */
        private final StringBuilder currentCharacters;
        /** Spaces per nesting level, or {@code null} to disable indentation. */
        private final Integer indent;
        private int nbElements;
        private boolean firstElement;
        /** Greater than zero while inside a style or script element. */
        private int rawTextDepth;

        // Not used by this class; kept because the field is visible to the enclosing package.
        SimpleContentHandler simpleContentHandler;

        public TableAwareContentHandler(OutputStream out) {
            this((OutputStream) out, (Integer) null);
        }

        public TableAwareContentHandler(OutputStream out, Integer indent) {
            this(out, (Writer) null, indent);
            simpleContentHandler = new SimpleContentHandler(out, indent);
        }


        public TableAwareContentHandler(Writer writer, Integer indent) {
            this((OutputStream) null, writer, indent);
            simpleContentHandler = new SimpleContentHandler(writer, indent);
        }

        private TableAwareContentHandler(OutputStream out, Writer writer, Integer indent) {
            this.out = out;
            this.writer = writer;
            this.currentCharacters = new StringBuilder();
            this.indent = indent;
            this.firstElement = true;
        }

        /** Writes a line break plus indentation for the current depth when indenting is on. */
        private void doIndentIfNeeded() throws SAXException {
            if (this.indent != null && !this.firstElement) {
                StringBuilder content = new StringBuilder("\n");

                for (int i = 0; i < this.nbElements; ++i) {
                    for (int j = 0; j < this.indent; ++j) {
                        content.append(' ');
                    }
                }

                this.write(content.toString());
            }
        }

        /**
         * Buffers character data until the next tag. Markup characters are escaped unless
         * {@link #mustEncodeCharachers()} returns false or the text belongs to a style or
         * script element, whose content HTML parsers read without decoding entities.
         */
        @Override
        public final void characters(char[] ch, int start, int length) throws SAXException {
            if (this.startingElement) {
                this.write(">");
            }

            this.startingElement = false;

            boolean encode = this.rawTextDepth == 0 && this.mustEncodeCharachers();
            for (int i = start; i < start + length; ++i) {
                char c = ch[i];
                if (encode) {
                    appendEscaped(this.currentCharacters, c, false);
                } else {
                    this.currentCharacters.append(c);
                }
            }

        }

        /** Hook for subclasses: return false to write character data unescaped. */
        protected boolean mustEncodeCharachers() {
            return true;
        }

        protected void flushCharacters(String characters) throws SAXException {
            this.write(characters);
        }

        protected void resetCharacters() {
            this.currentCharacters.setLength(0);
        }

        /** Writes buffered character data, if any, and clears the buffer. */
        private void flushPendingCharacters() throws SAXException {
            if (this.currentCharacters.length() > 0) {
                this.flushCharacters(this.currentCharacters.toString());
                this.resetCharacters();
            }
        }

        private void write(String content) throws SAXException {
            try {
                if (this.out != null) {
                    // Explicit charset: the platform default would vary between machines.
                    this.out.write(content.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                } else {
                    this.writer.write(content);
                }

            } catch (IOException var3) {
                throw new SAXException(var3);
            }
        }

        /** Writes {@code name="value"} preceded by a space, escaping the value. */
        private void writeAttribute(String attrName, String attrValue) throws SAXException {
            StringBuilder escaped = new StringBuilder();
            if (attrValue != null) {
                for (int i = 0; i < attrValue.length(); ++i) {
                    appendEscaped(escaped, attrValue.charAt(i), true);
                }
            }
            this.write(" ");
            this.write(attrName);
            this.write("=\"");
            this.write(escaped.toString());
            this.write("\"");
        }

        /** Appends {@code c}, replacing markup characters (and {@code "} in attributes). */
        private static void appendEscaped(StringBuilder target, char c, boolean attribute) {
            switch (c) {
                case '&':
                    target.append("&amp;");
                    break;
                case '<':
                    target.append("&lt;");
                    break;
                case '>':
                    target.append("&gt;");
                    break;
                case '"':
                    if (attribute) {
                        target.append("&quot;");
                    } else {
                        target.append(c);
                    }
                    break;
                default:
                    target.append(c);
            }
        }

        private static boolean isBorderedElement(String localName) {
            return "table".equals(localName) || "tr".equals(localName) || "td".equals(localName);
        }

        private static boolean isRawTextElement(String localName) {
            return "style".equals(localName) || "script".equals(localName);
        }

        /** Returns the forced border style, appended to {@code existing} when present. */
        private static String mergeStyle(String existing) {
            if (existing == null || existing.trim().isEmpty()) {
                return BORDER_STYLE;
            }
            String trimmed = existing.trim();
            // The border goes last so it wins over a border declared by the converter.
            return trimmed.endsWith(";") ? trimmed + BORDER_STYLE : trimmed + ";" + BORDER_STYLE;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {

            // th tags are dropped; their text content still reaches characters().
            if ("th".equals(localName)) {
                return;
            }
            if (this.startingElement) {
                this.write(">");
            }

            this.flushPendingCharacters();

            this.doIndentIfNeeded();

            this.write("<");
            this.write(localName);

            boolean bordered = isBorderedElement(localName);
            int length = attributes.getLength();
            if (bordered) {
                // Emit exactly one style attribute: a duplicate would be invalid markup and
                // parsers would ignore the second occurrence.
                String existingStyle = null;
                for (int i = 0; i < length; ++i) {
                    if ("style".equals(attributes.getLocalName(i))) {
                        existingStyle = attributes.getValue(i);
                    }
                }
                this.writeAttribute("style", mergeStyle(existingStyle));
            }
            for (int i = 0; i < length; ++i) {
                String attrName = attributes.getLocalName(i);
                if (bordered && "style".equals(attrName)) {
                    continue; // already written as part of the merged style
                }
                this.writeAttribute(attrName, attributes.getValue(i));
            }

            if (isRawTextElement(localName)) {
                ++this.rawTextDepth;
            }

            this.startingElement = true;
            this.firstElement = false;
            ++this.nbElements;


        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if ("th".equals(localName)) {
                return;
            }

            --this.nbElements;
            this.flushPendingCharacters();

            if (isRawTextElement(localName) && this.rawTextDepth > 0) {
                --this.rawTextDepth;
            }

            if (this.startingElement) {
                // No content was written since the start tag: close it as an empty element.
                this.write("/>");
                this.startingElement = false;
            } else {
                this.doIndentIfNeeded();
                this.write("</");
                this.write(localName);
                this.write(">");
            }


        }

    }

相關文章