Analysis and Implementation of a CSDN Blog Classification System

Posted by Thinkgamer_gyt on 2015-09-08

Part 1: Crawl the posts and tags of each CSDN blog column

Only the main code is given here:
#coding:utf-8

# Part 1: read the per-category link files and extract each post's
# content, tags, and the blogger's profile statistics
#===============================================================================

import urllib2
from bs4 import BeautifulSoup


def getPage(href):  # Masquerade as a browser and fetch the page source
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    req = urllib2.Request(
        url = href ,
        headers = headers
    )

    content = urllib2.urlopen(req).read()   # fetch once instead of re-requesting the page
    if content:
        return content

def getText(href,count):   # Extract the post and the blogger's profile info
    soup = BeautifulSoup(getPage(href))
    div = soup.find("div",id="article_details",class_="details")
    
#Write the post's tags---------------------------------------
    tag = div.find("div",class_="tag2box")             # tag container
    if tag:
        fp = open("%s\\tag.txt" % count,"a")           # open once, not once per tag
        for a in tag.findAll("a"):
            aTag = a.get_text()                        # tag text
            fp.write(aTag.encode('utf-8'))
            fp.write("\n")
        fp.close()
#Write the post title and body-----------------------------------------------------
    title = div.find("div",class_="article_title")  # title block
    content = div.find("div",id="article_content",class_="article_content") # body block
    titleName = title.h1.span.a.get_text().strip()       # title text
    cont = content.get_text()                            # body text
    fp = open("%s\\content.txt" % count,"a")
    fp.write(titleName.encode('utf-8'))
    fp.write(cont.encode('utf-8'))
    fp.close()
#Write the blogger's visit count, rank, etc.--------------------------------------------------
    div = soup.find("div",id="panel_Profile",class_="panel")
    if div:
        ul_1 = div.find("ul",id = "blog_rank")

        fp = open("%s\\aother.txt" % count,"a")
        ul_1_List = ul_1.findAll("li")
        visit = ul_1_List[0].get_text()                 # visit count
        fp.write(visit.encode("utf-8"))
        fp.write("\n")

        score = ul_1_List[1].get_text()                 # points
        fp.write(score.encode("utf-8"))
        fp.write("\n")

        num = ul_1_List[3].get_text()                   # rank
        fp.write(num.encode("utf-8"))
        fp.write("\n")

        ul_2 = div.find("ul",id = "blog_statistics")
        ul_2_List = ul_2.findAll("li")

        ower = ul_2_List[0].get_text()                  # number of original posts
        fp.write(ower.encode("utf-8"))
        fp.write("\n")

        fromAnother = ul_2_List[1].get_text()           # number of reposted posts
        fp.write(fromAnother.encode("utf-8"))
        fp.write("\n")

        translator = ul_2_List[2].get_text()            # number of translated posts
        fp.write(translator.encode("utf-8"))
        fp.write("\n")

        talk = ul_2_List[3].get_text()                  # number of comments
        fp.write(talk.encode("utf-8"))
        fp.write("\n\n")
        fp.close()
#------------------------------------------------------------------------


if __name__=="__main__":
    for count in range(10,11):                  # each count is one category's link file
        fp = open("%s.txt" % count,"r")         # one post URL per line
        hrefList = fp.readlines()
        fp.close()
        for href in hrefList:
            print href.strip()
            getText(href.strip(),count)
        print count, "is OK ==========================================="
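
The loop above expects per-category link files (1.txt through 10.txt, one post URL per line); the code that builds them is omitted. Below is a minimal sketch of how such a file might be produced from a column's listing page. The "link_title" class is an assumption about the old CSDN page layout, not the author's original code.

#coding:utf-8
# Hypothetical helper: collect the post links on one column page into count.txt.
from bs4 import BeautifulSoup

def saveLinkFile(columnHref, count):
    soup = BeautifulSoup(getPage(columnHref))                 # reuse getPage() from above
    fp = open("%s.txt" % count, "w")
    for span in soup.findAll("span", class_="link_title"):    # assumed selector
        a = span.find("a")
        if a and a.get("href"):
            fp.write(a["href"].encode("utf-8"))
            fp.write("\n")
    fp.close()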

Part 2: Compute word frequencies over the crawled text, find the N most frequent words, and write them to files (this provides the training data for the classification in Part 3; note that my training set is not very accurate, so suggestions from anyone with better ideas are welcome)

Here this is simplified to counting tags with a MapReduce program.
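
The MapReduce job itself is not shown. As a minimal sketch under the assumption of Hadoop Streaming, the tag count could look like the following; mapper() and reducer() would normally live in two separate scripts passed to Streaming.

#coding:utf-8
# Hypothetical Hadoop Streaming tag count (not the author's original job).
import sys

def mapper():
    # Emit "tag<TAB>1" for every tag read from stdin.
    for line in sys.stdin:
        tag = line.strip()
        if tag:
            print "%s\t1" % tag

def reducer():
    # Streaming sorts the mapper output by key, so identical tags arrive
    # in a contiguous run; sum the counts of each run.
    current, total = None, 0
    for line in sys.stdin:
        tag, count = line.rsplit("\t", 1)
        if tag != current:
            if current is not None:
                print "%s\t%d" % (current, total)
            current, total = tag, 0
        total += int(count)
    if current is not None:
        print "%s\t%d" % (current, total)

Sorting the reducer output by count and keeping the first N lines then gives the top-N tags per category.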

Part 3: Classify with the naive Bayes algorithm

For the theory behind naive Bayes, see: http://blog.csdn.net/gamer_gyt/article/details/47205371

For a Python implementation walkthrough, see: http://blog.csdn.net/gamer_gyt/article/details/47860945
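
In brief (my summary, not taken from the linked posts), the trainBayes and classifyBayes functions below implement the log form of the naive Bayes decision rule with Laplace smoothing:

    c* = argmax_c [ log P(c) + sum over words w in d of log P(w | c) ]
    P(w | c) = (N(w,c) + 1) / (N(c) + 2)

where N(w,c) is how often word w appears in class c's documents and N(c) is the total word count of class c; the +1 and +2 correspond to the ones(numWords) and 2.0 initializations in the code.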

The classification code is as follows:

#encoding:utf-8

from numpy import *


# Build the list of documents (one tag list per category) and the label list
def loadDataSet():
    wordList = []
    typeList = [0,1,2,3,4,5,6,7,8,9]    # 0-9 stand for the 10 categories
    for i in range(1,11):
        lineList2 = []
        fp = open("tagDispose\\%s.txt" % i,"r")
        lineList1 = fp.readlines()
        for j in range(len(lineList1)):
            strWord = lineList1[j].strip()
            if not strWord:
                continue                 # skip blank lines
            if ord(strWord[0]) < 127:    # ASCII word: normalize to lower case
                strWord = strWord.lower()
            lineList2.append(strWord)
        wordList.append(lineList2)
        fp.close()
    return wordList,typeList

# Build the union (vocabulary) of the words in all documents
def createBingjiList(wordList):
    bingjiList = set([])                     # start from an empty set
    for doc in wordList:
        bingjiList = bingjiList | set(doc)   # union with this document's words
    return list(bingjiList)

# Set-of-words model: mark a position 1 if that vocabulary word occurs in the input
def setOfWords(bingjiList,inputList):
    returnList = [0] * len(bingjiList)       # zero vector, one slot per vocabulary word
    for word in inputList:
        if word in bingjiList:
            returnList[bingjiList.index(word)] = 1
    return returnList

# Train the naive Bayes classifier: per-class priors plus per-class word
# probabilities with Laplace smoothing, kept in log space
def trainBayes(trainMatrix,trainTag):
    numClasses = 10
    pA = []                                   # prior probability of each class 0-9
    for c in range(numClasses):
        pA.append(trainTag.count(c)/float(len(trainTag)))
    numTrainDocs = len(trainMatrix)           # number of training documents
    numWords = len(trainMatrix[0])            # vocabulary size
    # Initialize numerators to 1 and denominators to 2 (Laplace smoothing) so that
    # no conditional probability is 0 and the product never collapses to 0
    pNum = [ones(numWords) for c in range(numClasses)]
    pDenom = [2.0] * numClasses
    for i in range(numTrainDocs):
        c = trainTag[i]
        pNum[c] += trainMatrix[i]
        pDenom[c] += sum(trainMatrix[i])
    pV = [log(pNum[c]/pDenom[c]) for c in range(numClasses)]   # log P(word | class)
    return pA,pV

# Naive Bayes classifier: pick the class with the highest log posterior
def classifyBayes(testDoc,pV,pA):
    listValue = []
    for c in range(10):
        listValue.append(sum(testDoc * pV[c]) + log(pA[c]))   # log P(d|c) + log P(c)
    return listValue.index(max(listValue))

# Read the text to classify and segment it into words
def getDoc():
    import jieba
    print "Preparing......\nPlease wait......"
    fp = open("test.txt",'r')
    wordList = []
    strDocList = fp.readlines()
    fp.close()
    for strDoc in strDocList:
        full_seg = jieba.cut(strDoc.strip(),cut_all = True)   # full-mode segmentation
        for word in full_seg:
            if len(word) > 0:              # skip the empty tokens jieba emits for punctuation
                if ord(word[0]) < 127:     # ASCII word: normalize to lower case
                    wordList.append(word.lower())
                else:
                    wordList.append(word)
    return wordList

def testingBayes():
    wordList,typeList = loadDataSet()
    bingjiList = createBingjiList(wordList)
    trainMat = []                                         # document-term matrix
    for lineDoc in wordList:
        trainMat.append(setOfWords(bingjiList,lineDoc))   # one word vector per category
    pA,pV = trainBayes(trainMat,typeList)
    testDoc = getDoc()                                    # words of the text to classify
    thisList = array(setOfWords(bingjiList,testDoc))
    return classifyBayes(thisList,pV,pA)

if __name__=="__main__":
    # CSDN category names: Mobile Dev, Web Frontend, Architecture, Languages,
    # Internet, Databases, Ops, Cloud, R&D Management, General
    typeNames = ['移動開發','Web前端','架構設計','程式語言','網際網路',\
                 '資料庫','系統運維','雲端計算','研發管理','綜合']
    classifiedNum = testingBayes()
    print "the text is classified as:", typeNames[classifiedNum].decode("utf-8")
