C#/winform採集百度hi文章

weixin_34219944發表於2009-08-10

/// <summary>
/// Main window of the Baidu Hi blog collector. Clicking the "go" button starts a
/// worker thread that downloads every page of the article list, downloads each
/// article, and writes the results as XML files.
/// </summary>
public partial class Form1 : Form
    {
        // Base address of the paged article list; page N lives at BlogIndexUrl + N
        // (the first page is .../blog/index/0).
        private const string BlogIndexUrl = "http://hi.baidu.com/306759613/blog/index/";

        // Base address of a single article; the article id and ".html" are appended.
        private const string ArticleUrl = "http://hi.baidu.com/306759613/blog/item/";

        // Directory that receives the generated XML files.
        private const string OutputDirectory = "f:\\p\\";

        // Timeout, in milliseconds, applied to every HTTP request.
        private const int RequestTimeoutMs = 3000;

        // Worker thread started by the last click of the "go" button.
        Thread newth;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts the collection run on a separate thread so the window stays responsive.
        /// </summary>
        private void buttonGo_Click(object sender, EventArgs e)
        {
            // The worker marshals every control access through RunOnUi, so the global
            // Control.CheckForIllegalCrossThreadCalls switch is no longer turned off here.
            newth = new Thread(new ThreadStart(doit));
            newth.Start();
        }

        /// <summary>
        /// Worker-thread entry point: determines the number of list pages, walks every page,
        /// downloads each listed article, writes one XML file per article and finally one
        /// XML file with the whole article list, which is also shown in the grid.
        /// </summary>
        void doit()
        {
            DataTable dt = CreateArticleTable();   // every list entry of the blog
            DataTable dt2 = CreateArticleTable();  // holds one article (with full text) at a time

            // The first list page carries the paging links; the highest page number found
            // is the last page. Pages are numbered from 0, so the count is that number + 1.
            string firstPage = DownloadPage(BlogIndexUrl + 0);
            List<string> pageLinks = new html(firstPage).getElementsByRegex(@"/blog/index/[\d]+");
            int lastPage = 0; // a blog without paging links has exactly one page
            foreach (string pageLink in pageLinks)
            {
                lastPage = Math.Max(lastPage, int.Parse(pageLink.Replace("/blog/index/", "")));
            }
            int pagecount = lastPage + 1;

            SetProgress(0);

            for (int i = 0; i < pagecount; i++)
            {
                // Page 0 has already been downloaded above; do not fetch it twice.
                string pageHtml = (i == 0) ? firstPage : DownloadPage(BlogIndexUrl + i);
                DataTable dti = new html(pageHtml).getAritcleTable(); // entries listed on this page
                int rows = dti.Rows.Count;

                SetProgress(i * 100 / pagecount);

                for (int j = 0; j < rows; j++)
                {
                    DataRow entry = dti.Rows[j];
                    dt.Rows.Add(entry.ItemArray); // append to the overall article list

                    // Column 1 holds the article id taken from the link.
                    string articleHtml = DownloadPage(ArticleUrl + entry[1] + ".html");
                    string body = ExtractArticleBody(articleHtml);

                    // Same layout as the list entry, but the description is the full article text.
                    dt2.Rows.Add(new object[] { entry[0], entry[1], body, entry[3], entry[4] });

                    // Each page owns an equal share of the bar and each article an equal share
                    // of its page, so the value can never exceed 100.
                    SetProgress((i * rows + j) * 100 / (pagecount * rows));

                    writeXML(dt2, OutputDirectory + entry[1] + ".xml"); // one XML file per article
                    dt2.Rows.Clear();
                }
            }

            // Write the list before binding it to the grid, so the table is not modified
            // from this thread while a control is bound to it.
            writeXML(dt, OutputDirectory + "Articel.xml");

            RunOnUi(delegate
            {
                progressBar1.Value = progressBar1.Maximum;
                label1.Text = progressBar1.Value + "%";
                dataGridView1.DataSource = dt;
                textBoxDebug.Text = textBoxDebug.Text + "寫入完畢\r\n";
            });
        }

        /// <summary>
        /// Creates an empty table with the five article columns
        /// (title, link, description, pubDate, category).
        /// </summary>
        private static DataTable CreateArticleTable()
        {
            DataTable table = new DataTable();
            table.Columns.Add(new DataColumn("title"));
            table.Columns.Add(new DataColumn("link"));
            table.Columns.Add(new DataColumn("description"));
            table.Columns.Add(new DataColumn("pubDate"));
            table.Columns.Add(new DataColumn("category"));
            return table;
        }

        /// <summary>
        /// Downloads <paramref name="address"/> and returns the body decoded as gb2312.
        /// The response and the reader are disposed even when reading fails.
        /// </summary>
        private static string DownloadPage(string address)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
            request.Timeout = RequestTimeoutMs;

            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(
                response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Returns the markup between the blog_text div's opening tag and the next
        /// "&lt;/div". Returns an empty string when the opening tag is missing and the
        /// rest of the page when no closing tag follows.
        /// </summary>
        private static string ExtractArticleBody(string pageHtml)
        {
            const string openTag = "<div id=\"blog_text\" class=\"cnt\">";

            int at = pageHtml.IndexOf(openTag, StringComparison.Ordinal);
            if (at < 0)
            {
                return string.Empty;
            }

            int start = at + openTag.Length;
            int end = pageHtml.IndexOf("</div", start, StringComparison.Ordinal);
            return end < 0 ? pageHtml.Substring(start) : pageHtml.Substring(start, end - start);
        }

        /// <summary>
        /// Shows <paramref name="percent"/> on the progress bar and in the label,
        /// clamped to the bar's Minimum..Maximum range.
        /// </summary>
        private void SetProgress(int percent)
        {
            RunOnUi(delegate
            {
                int clamped = Math.Max(progressBar1.Minimum, Math.Min(percent, progressBar1.Maximum));
                progressBar1.Value = clamped;
                label1.Text = progressBar1.Value + "%";
            });
        }

        /// <summary>
        /// Runs <paramref name="action"/> on the thread that owns this form. Does nothing
        /// when the form is disposed or its handle has not been created.
        /// </summary>
        private void RunOnUi(MethodInvoker action)
        {
            if (IsDisposed || !IsHandleCreated)
            {
                return;
            }

            if (InvokeRequired)
            {
                Invoke(action);
            }
            else
            {
                action();
            }
        }

        /// <summary>
        /// Writes a data table to an XML file (UTF-8). Every row becomes an
        /// "articels" element and every column a child element whose value is
        /// wrapped in a CDATA section. As a side effect the table's TableName is
        /// set to "articels".
        /// </summary>
        /// <param name="dt">Table to write.</param>
        /// <param name="fileName">Path of the file to create or overwrite.</param>
        public void writeXML(DataTable dt, string fileName)
        {
            dt.TableName = "articels";

            System.Text.StringBuilder xml = new System.Text.StringBuilder();
            xml.Append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n");
            xml.Append("<?xml-stylesheet href=\"t.xsl\" type=\"text/xsl\"?>\r\n");
            xml.Append("<root>\r\n");

            foreach (DataRow row in dt.Rows)
            {
                xml.Append("<").Append(dt.TableName).Append(">\r\n");

                foreach (DataColumn column in dt.Columns)
                {
                    // "]]>" inside the value would end the CDATA section early, so it is
                    // split across two adjacent CDATA sections.
                    string value = Convert.ToString(row[column]).Replace("]]>", "]]]]><![CDATA[>");

                    xml.Append("<").Append(column.ColumnName).Append(">\r\n<![CDATA[\r\n");
                    xml.Append(value).Append("\r\n");
                    xml.Append("]]>\r\n</").Append(column.ColumnName).Append(">\r\n");
                }

                xml.Append("</").Append(dt.TableName).Append(">\r\n");
            }

            xml.Append("</root>\r\n");

            using (StreamWriter w = new StreamWriter(fileName, false, System.Text.Encoding.UTF8))
            {
                w.Write(xml.ToString());
            }
        }

    }

    class html

    {

        string htmltext=string.Empty;

        /// <summary>

        /// 建構函式

        /// </summary>

        /// <param name="htmltext"></param>

        public html( string htmltext) {

            this.htmltext = htmltext;       

        }

        /// <summary>

        /// 獲取文章列表

        /// </summary>

        /// <returns></returns>

        public DataTable getAritcleTable(){

            DataTable dt = new DataTable();

            int start = htmltext.IndexOf("div id=\"m_blog\" class=\"modbox\">");//起始位置

            int end = htmltext.IndexOf("<div id=\"mod_artclg\" class=\"mod\">");//結束位置

            string htm = htmltext.Substring(start-1, end - start -1 );

            dt.Columns.Add(new DataColumn("title"));

            dt.Columns.Add(new DataColumn("link"));

            dt.Columns.Add(new DataColumn("description"));

            dt.Columns.Add(new DataColumn("pubDate"));

            dt.Columns.Add(new DataColumn("category"));

            string title, link, description, pubDate, category,temp;

            int nstart, nend;//記錄上次提取位置

            start = 0;

            do

            {//遍歷html文件 提取文章資訊

                nstart = htm.IndexOf("<div class=\"tit\">",start) + "<div class=\"tit\">".Length;

                if (nstart < start) break;

                start = nstart;

                nend = htm.IndexOf("</div>",start);

                start = nend + 5;

                temp = htm.Substring(nstart, nend - nstart );

                nstart = temp.IndexOf(">");

                nend =temp.IndexOf("</a>");

                title = temp.Substring(nstart + 1, nend-nstart-1 );//文章標題

                nstart = temp.IndexOf("\"");

                nend = temp.IndexOf("\"", nstart + 1);

                link = temp.Substring(nstart + 1, nend - nstart-1 );//連結

                nstart = link.IndexOf("item/")+"item/".Length;

                nend = link.IndexOf(".html");

                link = link.Substring(nstart, nend - nstart);//取檔名(去除副檔名)

                nstart = htm.IndexOf("<div class=\"date\">", start)+ "<div class=\"date\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                pubDate = htm.Substring(nstart , nend - nstart);//發表日期

                start = nend + 5;

                nstart = htm.IndexOf("<div class=\"cnt\">", start) + "<div class=\"cnt\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                start = nend + 5;

                description = htm.Substring(nstart, nend - nstart );//文章內容

                nstart = htm.IndexOf("<div class=\"opt\">", start) + "<div class=\"opt\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                start = nend + 5;

                temp = htm.Substring(nstart, nend - nstart );

                nstart = temp.IndexOf("");

                nend =temp.IndexOf("</a>");

                category=temp.Substring(nstart + 1, nend - nstart - 1); //文章分類            

dt.Rows.Add(new string[] { title, link, description, pubDate, category });

            } while (nstart > 0);

         

            return dt;

        }

    }

相關文章