PDF抽取文字 C# with Adobe API

快看一隻熊發表於2014-04-08

前提是PDF裡面要有可擷取的文字(純掃描圖片、沒有文字層的PDF無法用這個方法抽取)!

一次性取得所有頁內容:

        /// <summary>
        /// Original ("before improvement") approach: exports the text of every page of a PDF
        /// in a single call by asking Acrobat to save the document in its accessible-text format.
        /// </summary>
        /// <remarks>
        /// Drives Acrobat through COM automation (ProgID <c>AcroExch.AVDoc</c>).
        /// If Acrobat cannot open the PDF, the method returns without doing anything.
        /// Once the PDF has been opened, the document is always closed again, even when
        /// the export itself throws.
        /// </remarks>
        /// <param name="pdfFilePath">Path of the PDF file to open.</param>
        /// <param name="txtDirectoryPath">
        /// Passed unchanged as the first argument of the Acrobat JavaScript <c>saveAs</c> call.
        /// NOTE(review): despite the name, <c>saveAs</c> presumably expects the output file path
        /// rather than a directory - confirm against the callers.
        /// </param>
        public static void ConvertPdfToTxt(string pdfFilePath, string txtDirectoryPath)
        {
            CAcroAVDoc avDoc = (Acrobat.CAcroAVDoc)Microsoft.VisualBasic.Interaction.CreateObject("AcroExch.AVDoc");

            // Nothing to export (and nothing to close) when the PDF could not be opened.
            if (!avDoc.Open(pdfFilePath, ""))
            {
                return;
            }

            try
            {
                CAcroPDDoc pdDoc = (CAcroPDDoc)avDoc.GetPDDoc();
                object jsAcroObj = pdDoc.GetJSObject();
                Type jsType = jsAcroObj.GetType();
                const BindingFlags invokeFlags =
                    BindingFlags.InvokeMethod | BindingFlags.Public | BindingFlags.Instance;

                // "com.adobe.acrobat.accesstext" is the conversion id handed to saveAs.
                object[] saveAsParam = { txtDirectoryPath, "com.adobe.acrobat.accesstext" };
                jsType.InvokeMember("saveAs", invokeFlags, null, jsAcroObj, saveAsParam);

                object[] closeDocParam = { true };
                jsType.InvokeMember("closeDoc", invokeFlags, null, jsAcroObj, closeDocParam);
            }
            finally
            {
                // Always close the document so a failed export does not leave it open
                // in Acrobat. The single retry mirrors the original behaviour.
                if (!avDoc.Close(1))
                {
                    avDoc.Close(1);
                }
            }
        }

逐頁取出:

        /// <summary>
        /// Improved approach: extracts the text of a PDF one page at a time through the
        /// Acrobat JavaScript object (<c>getPageNumWords</c> / <c>getPageNthWord</c>).
        /// </summary>
        /// <param name="pdDoc">The Acrobat PD document to read.</param>
        /// <returns>
        /// One entry per page, in page order. The key is the 1-based page number as a string;
        /// the value is the concatenation of the words read from that page. Extraction is
        /// best-effort: if reading a page fails, the entry holds whatever was read before the
        /// failure (possibly an empty string), and an empty string when no JavaScript object
        /// is available.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="pdDoc"/> is null.</exception>
        public static List<KeyValuePair<String, String>> PdDocGetText(AcroPDDoc pdDoc)
        {
            if (pdDoc == null)
            {
                throw new ArgumentNullException("pdDoc");
            }

            List<KeyValuePair<String, String>> txt = new List<KeyValuePair<string, string>>();
            int pages = pdDoc.GetNumPages();

            // Fetched lazily inside the try block and then reused for all pages.
            object jso = null;

            for (int i = 0; i < pages; i++)
            {
                List<string> words = new List<string>();
                try
                {
                    if (jso == null)
                    {
                        jso = pdDoc.GetJSObject();
                    }

                    if (jso != null)
                    {
                        Type jsoType = jso.GetType();

                        object jsNumWords = jsoType.InvokeMember(
                            "getPageNumWords",
                            System.Reflection.BindingFlags.InvokeMethod,
                            null, jso, new object[] { i }, null);
                        int numWords = Convert.ToInt32(jsNumWords, System.Globalization.CultureInfo.InvariantCulture);

                        // Word indices run from 0 to numWords - 1.
                        for (int j = 0; j < numWords; j++)
                        {
                            // The third argument (false) is passed through unchanged from the
                            // original code; presumably it keeps punctuation and whitespace
                            // attached to each word - confirm against the Acrobat JS reference.
                            object jsWord = jsoType.InvokeMember(
                                "getPageNthWord",
                                System.Reflection.BindingFlags.InvokeMethod,
                                null, jso, new object[] { i, j, false }, null);

                            string word = jsWord as string;
                            if (word != null)
                            {
                                words.Add(word);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort: keep the words already read for this page and carry on
                    // with the next one, but leave a trace instead of failing silently.
                    System.Diagnostics.Debug.WriteLine(
                        "PdDocGetText: failed to read page " + (i + 1) + ": " + ex.Message);
                }

                // Add the text of the current page to the result list.
                txt.Add(new KeyValuePair<string, string>((i + 1).ToString(), string.Concat(words)));
            }

            return txt;
        }

在這個基礎之上我們再寫一些比如搜尋PDF內容的功能就容易多了吧。

 

補充:這裡有一個問題:當PDF的文字是縱向(直排)排版時,讀出來的內容會變成亂碼,因為這種方式是按橫向的行來讀取文字的。這個問題困擾我很久了,大家如果有思路的話,歡迎提出來交流一下。

相關文章