/// <summary>
/// Main window of the scraper. Downloads the article list and every article of a
/// Baidu Hi blog on a worker thread and exports the results as XML files.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Timeout, in milliseconds, applied to every HTTP request.</summary>
    private const int RequestTimeoutMs = 3000;

    /// <summary>Directory that receives the generated XML files.</summary>
    private const string OutputDirectory = "f:\\p\\";

    /// <summary>Worker thread running the scrape; null until the button is first clicked.</summary>
    Thread newth;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Starts the scrape on a worker thread. A click is ignored while a previous
    /// run is still in progress so two runs never write the same files concurrently.
    /// </summary>
    private void buttonGo_Click(object sender, EventArgs e)
    {
        if (newth != null && newth.IsAlive)
        {
            return;
        }

        newth = new Thread(new ThreadStart(doit));
        newth.Start();
    }

    /// <summary>
    /// Worker-thread entry point. Runs the scrape and reports any failure in the
    /// debug text box; an exception escaping a thread would terminate the process.
    /// </summary>
    void doit()
    {
        try
        {
            ScrapeBlog();
        }
        catch (Exception ex)
        {
            string message = "Error: " + ex.Message + "\r\n";
            RunOnUiThread(delegate { textBoxDebug.Text = textBoxDebug.Text + message; });
        }
    }

    /// <summary>
    /// Walks every index page of the blog, downloads each listed article, writes one
    /// XML file per article plus a summary file, and shows the summary in the grid.
    /// </summary>
    private void ScrapeBlog()
    {
        // Index pages are numbered from 0: .../blog/index/0, .../blog/index/1, ...
        const string url = "http://hi.baidu.com/306759613/blog/index/";
        // Articles live at .../blog/item/<id>.html
        const string arcurl = "http://hi.baidu.com/306759613/blog/item/";

        DataTable dt = CreateArticleTable(); // summary rows of every article
        Directory.CreateDirectory(OutputDirectory);

        // The pagination links on the first page reveal the highest page index.
        string firstPageHtml = DownloadPage(url + 0);
        int lastPageIndex = 0; // a blog without pagination links has a single page
        foreach (string pageLink in new html(firstPageHtml).getElementsByRegex(@"/blog/index/[\d]+"))
        {
            int pageIndex;
            if (int.TryParse(pageLink.Replace("/blog/index/", ""), out pageIndex) && pageIndex > lastPageIndex)
            {
                lastPageIndex = pageIndex;
            }
        }
        int pagecount = lastPageIndex + 1;

        ReportProgress(0);
        for (int i = 0; i < pagecount; i++)
        {
            // Page 0 is already in memory; do not download it a second time.
            string pageHtml = (i == 0) ? firstPageHtml : DownloadPage(url + i);
            DataTable dti = new html(pageHtml).getAritcleTable();
            ReportProgress(i * 100 / pagecount);

            for (int j = 0; j < dti.Rows.Count; j++)
            {
                DataRow summary = dti.Rows[j];
                dt.Rows.Add(summary.ItemArray);

                string articleId = Convert.ToString(summary["link"]);
                string body = ExtractArticleBody(DownloadPage(arcurl + articleId + ".html"));

                // Same columns as the summary row, with the full article text as description.
                DataTable dt2 = CreateArticleTable();
                dt2.Rows.Add(new object[]
                {
                    summary["title"], summary["link"], body, summary["pubDate"], summary["category"]
                });

                // Each page owns an equal share of the bar; articles subdivide that share.
                ReportProgress((i * 100 + j * 100 / dti.Rows.Count) / pagecount);
                writeXML(dt2, OutputDirectory + ToSafeFileName(articleId) + ".xml");
            }
        }

        RunOnUiThread(delegate
        {
            progressBar1.Value = progressBar1.Maximum;
            label1.Text = progressBar1.Value + "%";
            dataGridView1.DataSource = dt;
        });
        writeXML(dt, OutputDirectory + "Articel.xml");
        RunOnUiThread(delegate { textBoxDebug.Text = textBoxDebug.Text + "寫入完畢\r\n"; });
    }

    /// <summary>Creates an empty table with the five article columns.</summary>
    private static DataTable CreateArticleTable()
    {
        DataTable table = new DataTable();
        table.Columns.Add(new DataColumn("title"));
        table.Columns.Add(new DataColumn("link"));
        table.Columns.Add(new DataColumn("description"));
        table.Columns.Add(new DataColumn("pubDate"));
        table.Columns.Add(new DataColumn("category"));
        return table;
    }

    /// <summary>
    /// Downloads <paramref name="pageUrl"/> and returns its body decoded as gb2312.
    /// The response and reader are disposed even when reading fails.
    /// </summary>
    private static string DownloadPage(string pageUrl)
    {
        WebRequest request = WebRequest.Create(pageUrl);
        request.Timeout = RequestTimeoutMs;
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(
            response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the text between the blog_text container's opening tag and the next
    /// "&lt;/div". Returns an empty string when the container is absent, and the
    /// rest of the page when the closing tag is absent.
    /// </summary>
    private static string ExtractArticleBody(string pageHtml)
    {
        const string openTag = "<div id=\"blog_text\" class=\"cnt\">";
        int openAt = pageHtml.IndexOf(openTag, StringComparison.Ordinal);
        if (openAt < 0)
        {
            return string.Empty;
        }

        int bodyStart = openAt + openTag.Length;
        int bodyEnd = pageHtml.IndexOf("</div", bodyStart, StringComparison.Ordinal);
        if (bodyEnd < 0)
        {
            bodyEnd = pageHtml.Length;
        }
        return pageHtml.Substring(bodyStart, bodyEnd - bodyStart);
    }

    /// <summary>
    /// Replaces characters that are not valid in a file name (including path
    /// separators) with '_' so scraped text cannot redirect the output path.
    /// </summary>
    private static string ToSafeFileName(string name)
    {
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }
        return name;
    }

    /// <summary>
    /// Shows <paramref name="percent"/> on the progress bar and label, capped at the
    /// bar's maximum because assigning a larger value throws.
    /// </summary>
    private void ReportProgress(int percent)
    {
        RunOnUiThread(delegate
        {
            progressBar1.Value = Math.Min(percent, progressBar1.Maximum);
            label1.Text = progressBar1.Value + "%";
        });
    }

    /// <summary>
    /// Runs <paramref name="action"/> on the thread that owns this form. The call is
    /// dropped when the form has been closed, since there is nothing left to update.
    /// </summary>
    private void RunOnUiThread(Action action)
    {
        if (IsDisposed || !IsHandleCreated)
        {
            return;
        }

        if (!InvokeRequired)
        {
            action();
            return;
        }

        try
        {
            Invoke(action);
        }
        catch (InvalidOperationException)
        {
            // Also covers ObjectDisposedException. Ignore it only when the form was
            // closed between the check above and the call; otherwise it is a real error.
            if (!IsDisposed && IsHandleCreated)
            {
                throw;
            }
        }
    }

    /// <summary>
    /// Writes a data table to an XML file, one element per row and one CDATA child per column.
    /// </summary>
    /// <param name="dt">Table to export; its TableName is set to "articels".</param>
    /// <param name="fileName">Path of the file to create or overwrite (UTF-8).</param>
    public void writeXML(DataTable dt, string fileName)
    {
        dt.TableName = "articels";

        System.Text.StringBuilder xml = new System.Text.StringBuilder();
        xml.Append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n");
        xml.Append("<?xml-stylesheet href=\"t.xsl\" type=\"text/xsl\"?>\r\n");
        xml.Append("<root>\r\n");
        foreach (DataRow row in dt.Rows)
        {
            xml.Append("<").Append(dt.TableName).Append(">\r\n");
            foreach (DataColumn column in dt.Columns)
            {
                xml.Append("<").Append(column.ColumnName).Append(">\r\n<![CDATA[\r\n");
                xml.Append(EscapeCData(Convert.ToString(row[column]))).Append("\r\n");
                xml.Append("]]>\r\n</").Append(column.ColumnName).Append(">\r\n");
            }
            xml.Append("</").Append(dt.TableName).Append(">\r\n");
        }
        xml.Append("</root>\r\n");

        using (StreamWriter w = new StreamWriter(fileName, false, System.Text.Encoding.UTF8))
        {
            w.Write(xml.ToString());
        }
    }

    /// <summary>
    /// Splits any "]]&gt;" in <paramref name="value"/> across two CDATA sections so
    /// the text cannot terminate the enclosing section early.
    /// </summary>
    private static string EscapeCData(string value)
    {
        return value.Replace("]]>", "]]]]><![CDATA[>");
    }
}
/// <summary>
/// Minimal string-based helper for pulling data out of a downloaded blog page.
/// </summary>
class html
{
    string htmltext = string.Empty;

    /// <summary>
    /// Wraps the text of one page.
    /// </summary>
    /// <param name="htmltext">Page markup; null is treated as an empty page.</param>
    public html(string htmltext)
    {
        this.htmltext = htmltext ?? string.Empty;
    }

    /// <summary>
    /// Returns the text of every match of <paramref name="pattern"/> in the page,
    /// in document order.
    /// </summary>
    /// <param name="pattern">Regular expression to search for.</param>
    /// <returns>The matched substrings; empty when nothing matches.</returns>
    public List<string> getElementsByRegex(string pattern)
    {
        // NOTE(review): contract inferred from the call in Form1, which strips a
        // prefix from each returned string and parses the rest as a page number.
        List<string> matches = new List<string>();
        foreach (System.Text.RegularExpressions.Match match in
                 System.Text.RegularExpressions.Regex.Matches(htmltext, pattern))
        {
            matches.Add(match.Value);
        }
        return matches;
    }

    /// <summary>
    /// Extracts the article list found between the m_blog and mod_artclg containers.
    /// </summary>
    /// <returns>
    /// A table with columns title, link, description, pubDate and category, one row
    /// per article. The table is empty when the page has no article list. link holds
    /// the article file name without the ".html" extension.
    /// </returns>
    public DataTable getAritcleTable()
    {
        DataTable dt = new DataTable();
        dt.Columns.Add(new DataColumn("title"));
        dt.Columns.Add(new DataColumn("link"));
        dt.Columns.Add(new DataColumn("description"));
        dt.Columns.Add(new DataColumn("pubDate"));
        dt.Columns.Add(new DataColumn("category"));

        // Restrict parsing to the article list so same-named divs elsewhere are ignored.
        int listStart = htmltext.IndexOf("div id=\"m_blog\" class=\"modbox\">", System.StringComparison.Ordinal);
        if (listStart < 0)
        {
            return dt;
        }
        int listEnd = htmltext.IndexOf("<div id=\"mod_artclg\" class=\"mod\">", listStart, System.StringComparison.Ordinal);
        if (listEnd < 0)
        {
            return dt;
        }
        string htm = htmltext.Substring(listStart, listEnd - listStart);

        int position = 0; // scan cursor inside htm
        while (true)
        {
            string titleBlock, pubDate, description, optionBlock;
            // Each article is four consecutive divs; the list ends when one is missing.
            if (!TryReadBetween(htm, "<div class=\"tit\">", "</div>", ref position, out titleBlock)
                || !TryReadBetween(htm, "<div class=\"date\">", "</div>", ref position, out pubDate)
                || !TryReadBetween(htm, "<div class=\"cnt\">", "</div>", ref position, out description)
                || !TryReadBetween(htm, "<div class=\"opt\">", "</div>", ref position, out optionBlock))
            {
                break;
            }

            // Title is the anchor text; link is the first quoted attribute value.
            string title = Between(titleBlock, ">", "</a>");
            string href = Between(titleBlock, "\"", "\"");
            string link = href == null ? null : Between(href, "item/", ".html");
            if (title == null || link == null)
            {
                continue; // malformed entry: skip it but keep scanning
            }

            // Category is the text after the first ':' up to the end of that anchor.
            string category = Between(optionBlock, ":", "</a>") ?? string.Empty;

            dt.Rows.Add(new string[] { title, link, description, pubDate, category });
        }
        return dt;
    }

    /// <summary>
    /// Finds the first <paramref name="open"/> at or after <paramref name="position"/>
    /// and the first <paramref name="close"/> after it. On success it stores the text
    /// between them in <paramref name="value"/> and moves <paramref name="position"/>
    /// past the closing delimiter.
    /// </summary>
    private static bool TryReadBetween(string text, string open, string close, ref int position, out string value)
    {
        value = null;
        int openAt = text.IndexOf(open, position, System.StringComparison.Ordinal);
        if (openAt < 0)
        {
            return false;
        }

        int contentStart = openAt + open.Length;
        int closeAt = text.IndexOf(close, contentStart, System.StringComparison.Ordinal);
        if (closeAt < 0)
        {
            return false;
        }

        value = text.Substring(contentStart, closeAt - contentStart);
        position = closeAt + close.Length;
        return true;
    }

    /// <summary>
    /// Returns the text between the first <paramref name="open"/> and the next
    /// <paramref name="close"/>, or null when either delimiter is missing.
    /// </summary>
    private static string Between(string text, string open, string close)
    {
        int position = 0;
        string value;
        return TryReadBetween(text, open, close, ref position, out value) ? value : null;
    }
}