方案總結:
1. poi(支援html屬性):存在一個bug,table的cell中若同時包含文字和圖片,轉換後圖片會丟失
2:tika(主要是提取內容,轉換出來的html不太好)
3.openoffice(依賴安裝,轉出的html不太好)
4. aspose(功能強大但是付費,不過也可以免費使用),缺點:程式碼不是開源的,因此不支援擴充套件
5. mammoth(與poi相比缺少標籤的屬性,例如顏色、字型):不存在同一個cell中既有文字又有圖片時轉換後圖片丟失的問題;樣式方面支援擴充套件,而且樣例很多
本文基於poi實現docx to html
1.maven依賴
<!-- Locally bundled jars (system scope) used for the formula conversion pipeline. -->
<dependency>
    <groupId>docx4j</groupId>
    <artifactId>docx4j</artifactId>
    <version>3.3.7</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/src/main/resources/lib/docx4j-3.3.7.jar</systemPath>
</dependency>
<dependency>
    <groupId>org.jdom</groupId>
    <artifactId>jdom</artifactId>
    <version>2.0.6</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/src/main/resources/lib/jdom-2.0.6.jar</systemPath>
</dependency>
<dependency>
    <groupId>fmath</groupId>
    <artifactId>fmath-mathml-java</artifactId>
    <version>3.1</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/src/main/resources/lib/fmath-mathml-java-3.1.jar</systemPath>
</dependency>
<dependency>
    <groupId>org.scilab.forge</groupId>
    <artifactId>jlatexmath</artifactId>
    <version>1.0.8-SNAPSHOT</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/src/main/resources/lib/jlatexmath-1.0.8-SNAPSHOT.jar</systemPath>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>
<dependency>
    <groupId>net.sf.saxon</groupId>
    <artifactId>Saxon-HE</artifactId>
    <version>9.8.0-12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.core -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
    <version>1.0.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.xhtml -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.6</version>
</dependency>
<!-- All Apache POI artifacts must share one release: mixing POI jar versions is not supported
     by the POI project. They are aligned here on 3.17, the version already used for poi and
     poi-scratchpad. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-collections4</artifactId>
    <version>4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.xmlbeans/xmlbeans -->
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>2.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>
2. 程式碼實現
2.1 docxToHtml方法
// docx轉換為html public static void docxToHtml(String sourceFile, String outDir) throws Exception { if (!sourceFile.contains(".docx")) { throw new IllegalArgumentException("you should use docx file"); } FileInputStream fis = new FileInputStream(sourceFile); ZipSecureFile.setMinInflateRatio(0.004); XWPFDocument document = new XWPFDocument(fis); try { for (XWPFParagraph paragraph : document.getParagraphs()) { handle(paragraph); } String sourcePrefix = sourceFile.split("\\.")[0]; File sourceDirFile = new File(sourcePrefix); String parentDir = sourceDirFile.getParent(); String name = sourceDirFile.getName(); UUID uuid = UUID.randomUUID(); String uuidStr = uuid.toString(); uuidStr = uuidStr.replace("-", "").substring(0, 16); sourcePrefix = parentDir + File.separator + "html" + File.separator; File tempFileDir = new File(sourcePrefix); if (!tempFileDir.exists()) { tempFileDir.mkdirs(); } String tempFile = sourcePrefix + name + "-temp" + uuidStr + ".docx"; // FileOutputStream out = new FileOutputStream(tempFile); // document.write(out); // out.close(); for (XWPFTable table : document.getTables()) { for (XWPFTableRow row : table.getRows()) { for (XWPFTableCell cell : row.getTableCells()) { for (XWPFParagraph paragraph : cell.getParagraphs()) { handle(paragraph); } } } } FileOutputStream out1 = new FileOutputStream(tempFile); document.write(out1); out1.close(); fis.close(); FileOutputStream fileOutputStream = null; OutputStreamWriter outputStreamWriter = null; try { XWPFDocument documentNew = new XWPFDocument(Files.newInputStream(Paths.get(tempFile))); XHTMLOptions options = XHTMLOptions.create(); File dirFile = new File(outDir); if (!dirFile.exists()) { dirFile.mkdirs(); } StringBuilder imageDir = new StringBuilder(); imageDir.append(outDir).append("/image"); File imagrDirFile = new File(imageDir.toString()); if (!imagrDirFile.exists()) { imagrDirFile.mkdirs(); } // 存放圖片的資料夾 options.setExtractor(new FileImageExtractor(new File(imageDir.toString()))); // html中圖片的路徑 
options.URIResolver(new BasicURIResolver("image")); // XHTMLOptions options = XHTMLOptions.create(); // options.setIgnoreStylesIfUnused(false); // options.getStyleSheet().addStyle("table", "border-collapse:collapse;width:100%;"); // options.getStyleSheet().addStyle("td", "border:1px solid black;"); // options.getStyleSheet().addStyle("th", "border:1px solid black;"); // options.setStyleSheet("h3 { display: none; }"); // 自定義內容處理工廠 options.setContentHandlerFactory(new IContentHandlerFactory() { @Override public ContentHandler create(OutputStream outputStream, Writer writer, XHTMLOptions xhtmlOptions) { return outputStream != null ? new TableAwareContentHandler(outputStream, options.getIndent()) : new TableAwareContentHandler(writer, options.getIndent()); } }); String targetFilePath=outDir+File.separator + name+".html"; fileOutputStream = new FileOutputStream(targetFilePath); outputStreamWriter = new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance(); xhtmlConverter.convert(documentNew, outputStreamWriter, options); } catch (Exception e) { e.printStackTrace(); } finally { if (outputStreamWriter != null) { outputStreamWriter.close(); } if (fileOutputStream != null) { fileOutputStream.close(); } } } catch (Exception e) { e.printStackTrace(); } }
2.2 handle方法
private static void handle(XWPFParagraph paragraph) throws InvalidFormatException, IOException { CTP ctp = paragraph.getCTP(); XmlObject[] xmlObjects = ctp.selectPath( "declare namespace m='http://schemas.openxmlformats.org/officeDocument/2006/math' .//m:oMath"); for (XmlObject xmlObject : xmlObjects) { // System.out.println("在文件中發現公式: " + xmlObject.xmlText()); String mml = MathmlUtils.convertOMML2MML(xmlObject.xmlText()); String latex = MathmlUtils.convertMML2Latex(mml); latex = LatexUtil.latexFilter(latex); XmlCursor cursor = xmlObject.newCursor(); cursor.removeXml(); XmlCursor startCursor = cursor.newCursor(); startCursor.toPrevToken(); XWPFRun run = paragraph.createRun(); TeXFormula formula = new TeXFormula(latex); TeXIcon icon = formula.createTeXIcon(TeXConstants.STYLE_DISPLAY, 20); BufferedImage image = new BufferedImage(icon.getIconWidth(), icon.getIconHeight(), BufferedImage.TYPE_INT_ARGB); Graphics2D g2 = image.createGraphics(); g2.setColor(Color.WHITE); g2.fillRect(0, 0, icon.getIconWidth(), icon.getIconHeight()); JLabel jl = new JLabel(); jl.setForeground(new Color(0, 0, 0)); icon.paintIcon(jl, g2, 0, 0); // 建立一個用來存放影像位元組的輸出流 java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream(); try { // 將影像編碼為png ImageIO.write(image, "png", baos); } catch (IOException e) { e.printStackTrace(); } InputStream isa = new ByteArrayInputStream(baos.toByteArray()); // 設定樣式為普通文字,可以根據需要調整 run.setBold(false); run.setItalic(false); run.setFontSize(14); run.addPicture(isa, XWPFDocument.PICTURE_TYPE_PNG, "image.png", Units.toEMU(200), Units.toEMU(100)); cursor.toPrevToken(); XmlObject newRunXmlObject = run.getCTR(); if (cursor.toPrevSibling()) { cursor.toEndToken(); XmlCursor newCursor = newRunXmlObject.newCursor(); newCursor.moveXml(startCursor); } else { // 否則,把新Run插入到公式所在位置的開頭 cursor.toParent(); cursor.toNextToken(); XmlCursor newCursor = newRunXmlObject.newCursor(); newCursor.moveXml(cursor); } } }
2.3 TableAwareContentHandler 類
/**
 * SAX content handler that serializes the events it receives as (X)HTML text and forces a
 * solid black border on {@code table}, {@code tr} and {@code td} elements.
 *
 * <p>{@code th} start/end tags are dropped; whatever is nested inside them is still written.
 * Output goes either to an {@link OutputStream} (encoded as UTF-8) or to a {@link Writer}.
 * Text content and attribute values are escaped so that characters such as {@code <} and
 * {@code &} coming from the document cannot break the generated markup.
 */
static class TableAwareContentHandler extends DefaultHandler {

    /** Inline CSS added to every table, row and cell. */
    private static final String BORDER_STYLE = "border: 1px solid black;";

    private final OutputStream out;
    private final Writer writer;
    /** True while a start tag has been written but its closing {@code >} has not. */
    private boolean startingElement;
    private final StringBuilder currentCharacters;
    /** Number of spaces per nesting level, or null to disable indentation. */
    private final Integer indent;
    private int nbElements;
    private boolean firstElement;
    // Not used by this class; kept (and still initialized) for compatibility with existing code.
    SimpleContentHandler simpleContentHandler;

    public TableAwareContentHandler(OutputStream out) {
        this((OutputStream) out, (Integer) null);
    }

    public TableAwareContentHandler(OutputStream out, Integer indent) {
        this(out, (Writer) null, indent);
        simpleContentHandler = new SimpleContentHandler(out, indent);
    }

    public TableAwareContentHandler(Writer writer, Integer indent) {
        this((OutputStream) null, writer, indent);
        simpleContentHandler = new SimpleContentHandler(writer, indent);
    }

    private TableAwareContentHandler(OutputStream out, Writer writer, Integer indent) {
        this.out = out;
        this.writer = writer;
        this.currentCharacters = new StringBuilder();
        this.indent = indent;
        this.firstElement = true;
    }

    /** Writes a line break plus indentation for the current depth, except before the first element. */
    private void doIndentIfNeeded() throws SAXException {
        if (this.indent == null || this.firstElement) {
            return;
        }
        int spaces = this.nbElements * this.indent;
        StringBuilder content = new StringBuilder(spaces + 1).append('\n');
        for (int i = 0; i < spaces; ++i) {
            content.append(' ');
        }
        this.write(content.toString());
    }

    /** Buffers text content (escaped when {@link #mustEncodeCharachers()} is true). */
    @Override
    public final void characters(char[] ch, int start, int length) throws SAXException {
        if (this.startingElement) {
            this.write(">");
        }
        this.startingElement = false;
        boolean encode = this.mustEncodeCharachers();
        for (int i = start; i < start + length; ++i) {
            char c = ch[i];
            if (encode) {
                appendEscaped(this.currentCharacters, c, false);
            } else {
                this.currentCharacters.append(c);
            }
        }
    }

    protected boolean mustEncodeCharachers() {
        return true;
    }

    protected void flushCharacters(String characters) throws SAXException {
        this.write(characters);
    }

    protected void resetCharacters() {
        this.currentCharacters.setLength(0);
    }

    /** Appends {@code c}, replacing markup-significant characters by entities. */
    private static void appendEscaped(StringBuilder target, char c, boolean inAttribute) {
        switch (c) {
            case '&':
                target.append("&amp;");
                break;
            case '<':
                target.append("&lt;");
                break;
            case '>':
                target.append("&gt;");
                break;
            case '"':
                if (inAttribute) {
                    target.append("&quot;");
                } else {
                    target.append(c);
                }
                break;
            default:
                target.append(c);
        }
    }

    private static String escapeAttribute(String value) {
        if (value == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); ++i) {
            appendEscaped(escaped, value.charAt(i), true);
        }
        return escaped.toString();
    }

    private static boolean isBorderedElement(String localName) {
        return "table".equals(localName) || "tr".equals(localName) || "td".equals(localName);
    }

    private void flushPendingCharacters() throws SAXException {
        if (this.currentCharacters.length() > 0) {
            this.flushCharacters(this.currentCharacters.toString());
            this.resetCharacters();
        }
    }

    private void write(String content) throws SAXException {
        try {
            if (this.out != null) {
                // Explicit charset: the platform default would corrupt non-ASCII text.
                this.out.write(content.getBytes(StandardCharsets.UTF_8));
            } else {
                this.writer.write(content);
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    private void writeAttribute(String attrName, String attrValue) throws SAXException {
        this.write(" ");
        this.write(attrName);
        this.write("=\"");
        this.write(escapeAttribute(attrValue));
        this.write("\"");
    }

    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes)
            throws SAXException {
        if ("th".equals(localName)) {
            return;
        }
        if (this.startingElement) {
            this.write(">");
        }
        this.flushPendingCharacters();
        this.doIndentIfNeeded();

        this.write("<");
        this.write(localName);

        boolean bordered = isBorderedElement(localName);
        int length = attributes.getLength();
        if (bordered) {
            // Merge the forced border into an existing style attribute instead of emitting a
            // second "style" attribute. The border comes last so it still takes precedence.
            StringBuilder style = new StringBuilder();
            for (int i = 0; i < length; ++i) {
                if ("style".equals(attributes.getLocalName(i))) {
                    String existing = attributes.getValue(i);
                    if (existing != null && !existing.trim().isEmpty()) {
                        style.append(existing.trim());
                        if (style.charAt(style.length() - 1) != ';') {
                            style.append(';');
                        }
                    }
                }
            }
            style.append(BORDER_STYLE);
            this.writeAttribute("style", style.toString());
        }
        for (int i = 0; i < length; ++i) {
            String attrName = attributes.getLocalName(i);
            if (bordered && "style".equals(attrName)) {
                continue; // already merged above
            }
            this.writeAttribute(attrName, attributes.getValue(i));
        }

        this.startingElement = true;
        this.firstElement = false;
        ++this.nbElements;
    }

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("th".equals(localName)) {
            return;
        }
        --this.nbElements;
        this.flushPendingCharacters();
        if (this.startingElement) {
            // No content was written since the start tag: emit a self-closing tag.
            this.write("/>");
            this.startingElement = false;
        } else {
            this.doIndentIfNeeded();
            this.write("</");
            this.write(localName);
            this.write(">");
        }
    }
}