前提是PDF裡面包含可擷取的文字(例如不是純掃描影像)!
一次性取得所有頁內容:
/// <summary>
/// Original (pre-improvement) approach: exports the text of all pages of a PDF in a single
/// call by asking Acrobat's JavaScript bridge to save the document with the
/// "com.adobe.acrobat.accesstext" conversion id.
/// </summary>
/// <param name="pdfFilePath">Path of the PDF file handed to <c>AVDoc.Open</c>.</param>
/// <param name="txtDirectoryPath">
/// Passed unchanged as the first argument of the JavaScript <c>saveAs</c> call.
/// NOTE(review): despite the name, this is presumably the full path of the output text
/// file rather than a directory - confirm against the Acrobat JavaScript API.
/// </param>
/// <remarks>
/// Returns without doing anything when Acrobat reports that the file could not be opened.
/// The document is closed and the COM wrappers created here are released even when the
/// export itself fails.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when the opened document does not expose a JavaScript object.
/// </exception>
public static void ConvertPdfToTxt(string pdfFilePath, string txtDirectoryPath)
{
    const BindingFlags invokeFlags = BindingFlags.InvokeMethod | BindingFlags.Public | BindingFlags.Instance;

    // Create the AVDoc automation object.
    CAcroAVDoc avDoc = (Acrobat.CAcroAVDoc)Microsoft.VisualBasic.Interaction.CreateObject("AcroExch.AVDoc");
    CAcroPDDoc pdDoc = null;
    object jsAcroObj = null;
    bool opened = false;

    try
    {
        // Open the PDF; nothing to export (and nothing to close) if this fails.
        opened = avDoc.Open(pdfFilePath, "");
        if (!opened)
        {
            return;
        }

        pdDoc = (CAcroPDDoc)avDoc.GetPDDoc();
        jsAcroObj = pdDoc.GetJSObject();
        if (jsAcroObj == null)
        {
            throw new InvalidOperationException("The PDF document did not provide a JavaScript object.");
        }

        Type jsType = jsAcroObj.GetType();

        try
        {
            object[] saveAsParam = { txtDirectoryPath, "com.adobe.acrobat.accesstext" };
            jsType.InvokeMember("saveAs", invokeFlags, null, jsAcroObj, saveAsParam);
        }
        finally
        {
            // Close the document on the JavaScript side even if saveAs threw.
            object[] closeDocParam = { true };
            jsType.InvokeMember("closeDoc", invokeFlags, null, jsAcroObj, closeDocParam);
        }
    }
    finally
    {
        // Same close-with-one-retry as before, but now guaranteed to run on failure too.
        if (opened && !avDoc.Close(1))
        {
            avDoc.Close(1);
        }

        // Release the COM wrappers created by this method so they are not kept alive
        // until the garbage collector happens to finalize them.
        foreach (object comObject in new object[] { jsAcroObj, pdDoc, avDoc })
        {
            if (comObject != null && System.Runtime.InteropServices.Marshal.IsComObject(comObject))
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(comObject);
            }
        }
    }
}
逐頁取出:
/// <summary>
/// Improved approach: reads the words of each page separately through the document's
/// JavaScript object (<c>getPageNumWords</c> / <c>getPageNthWord</c>) and returns one
/// entry per page.
/// </summary>
/// <param name="pdDoc">The PDDoc whose pages are read; it is neither closed nor released here.</param>
/// <returns>
/// One pair per page, in page order. The key is the 1-based page number as a string and the
/// value is the concatenation of the words returned for that page. If reading a page fails,
/// the entry is still added and holds whatever text was collected before the failure
/// (possibly an empty string).
/// </returns>
public static List<KeyValuePair<String, String>> PdDocGetText(AcroPDDoc pdDoc)
{
    List<KeyValuePair<String, String>> txt = new List<KeyValuePair<string, string>>();
    int pages = pdDoc.GetNumPages();

    // Fetched lazily inside the try block so a failure here is still handled per page,
    // then reused for the remaining pages instead of being requested again.
    object jso = null;

    for (int i = 0; i < pages; i++)
    {
        System.Text.StringBuilder pageText = new System.Text.StringBuilder();

        try
        {
            if (jso == null)
            {
                jso = pdDoc.GetJSObject();
            }

            if (jso != null)
            {
                Type jsType = jso.GetType();

                object jsNumWords = jsType.InvokeMember(
                    "getPageNumWords",
                    System.Reflection.BindingFlags.InvokeMethod,
                    null,
                    jso,
                    new object[] { i },
                    null);
                int numWords = Convert.ToInt32(jsNumWords, System.Globalization.CultureInfo.InvariantCulture);

                // Word indices are 0-based, so the last valid index is numWords - 1.
                for (int j = 0; j < numWords; j++)
                {
                    // Third argument false: same value the original passed to getPageNthWord.
                    object jsWord = jsType.InvokeMember(
                        "getPageNthWord",
                        System.Reflection.BindingFlags.InvokeMethod,
                        null,
                        jso,
                        new object[] { i, j, false },
                        null);

                    pageText.Append(Convert.ToString(jsWord, System.Globalization.CultureInfo.InvariantCulture));
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: one unreadable page must not abort the whole document,
            // but the failure is reported instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning(
                "PdDocGetText: failed to read page {0}: {1}",
                i + 1,
                ex.Message);
        }

        // Add the current page's text to the result list.
        txt.Add(new KeyValuePair<string, string>(
            (i + 1).ToString(System.Globalization.CultureInfo.InvariantCulture),
            pageText.ToString()));
    }

    return txt;
}
在這個基礎之上我們再寫一些比如搜尋PDF內容的功能就容易多了吧。
補充:這裡有一個問題,當PDF的排版是縱向(直排)的時候,讀出來的內容會變成亂碼,因為文字是按橫向的行讀取的。這個問題困擾我很久了,大家如果有思路的話,歡迎提出來交流一下。