chapter7:樸素貝葉斯及文字---非結構化文字分類

CopperDong發表於2017-10-06

 非結構化資料包括郵件、推文、博文、新聞報導等物件。這些資料看上去(至少一眼看上去)並不能很清晰地通過表格來描述。

一、一個文字正負傾向性的自動判定系統

  這裡的資料集稱為訓練語料庫(training corpus)。語料庫中的每條記錄稱為一個文件(即使它只是一段140個字元的推文),且每個文件都標註了正面或負面類別。

  一種方法可以從文件的第一句開始,比如Puts the Thrill back in Thriller,然後計算一篇正面文件以Puts開始的概率,以the為開始第二個詞的概率,以Thrill為第三個詞的概率,等等。這麼多概率需要計算使得上述做法不可行。

  但我們可以通過將文件看成無序詞袋(bag of words)從而對上述做法進行簡化。

二、訓練階段

   Newsgroup語料庫:http://qwone.com/~jason/20Newsgroups/

     該資料有來自20個不同新聞組的帖子

           常用詞和停用詞

     去掉停用詞會減少我們的處理量

     去掉它們之後會提高效能

三、用Python

     BayesText類

   1、初始化方法

     讀入停用詞表中的詞

     讀取訓練目錄來獲取子目錄的名字

     對每個子目錄,呼叫train方法來計算該目錄下所有檔案中的單詞出現數目

     利用如下公式計算概率

       P(wk | hi) = (nk + 1) / ( n + |Vocabulary| )

from __future__ import print_function
import os, codecs, math

class BayesText:
    """Naive Bayes text classifier trained from a directory of text files.

    Attributes set by ``__init__``:
      vocabulary -- dict mapping each kept word to its occurrence count
                    summed over all categories (words seen fewer than 3
                    times are removed).
      prob       -- dict mapping category -> dict of word -> smoothed
                    probability P(word | category) for every vocabulary
                    word.  Words pruned from the vocabulary may still be
                    present with their raw counts; they are never looked
                    up because classify() checks the vocabulary first.
      totals     -- dict mapping category -> number of non-stopword
                    tokens counted in that category.
      stopwords  -- dict whose keys are the words to ignore.
      categories -- list of category names (subdirectory names).
    """

    def __init__(self, trainingdir, stopwordlist):
        """Train the classifier.

        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.  A trailing path separator on
        trainingdir is optional.
        The stopwordlist is the path of a file of words (one per line)
        that will be removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1
        # Only subdirectories are categories; plain files are skipped.
        # os.path.join makes this work with or without a trailing slash.
        self.categories = [name for name in os.listdir(trainingdir)
                           if os.path.isdir(os.path.join(trainingdir, name))]
        print("Counting ...")
        for category in self.categories:
            print('    ' + category)
            # Until the probability pass below, self.prob[category]
            # temporarily holds raw word counts.
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # Eliminate any word in the vocabulary that doesn't occur at
        # least 3 times.  The words are collected first because a dict
        # cannot be modified while it is being iterated over.
        rare = [word for word, seen in self.vocabulary.items() if seen < 3]
        for word in rare:
            del self.vocabulary[word]
        # Now compute probabilities:
        #   P(word | category) = (count + 1) / (total + |vocabulary|)
        # NOTE(review): totals still include the tokens of pruned words,
        # so the probabilities of a category sum to 1 only when nothing
        # was pruned -- confirm whether that is intended.
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print('    ' + category)
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            for word in self.vocabulary:
                # A word never seen in this category has count 0, so the
                # add-one smoothing gives it 1 / denominator.
                counts[word] = float(counts.get(word, 0) + 1) / denominator
        print("DONE TRAINING\n\n")

    def train(self, trainingdir, category):
        """Count word occurrences for a particular category.

        Reads every file in the category subdirectory of trainingdir
        (decoded as iso8859-1), splits each line on whitespace, strips
        leading/trailing punctuation, lowercases, and skips empty tokens
        and stopwords.  Each kept token is also added to
        self.vocabulary.

        Returns a tuple (counts, total): counts maps word -> occurrences
        in this category, total is the number of kept tokens.
        """
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for filename in os.listdir(currentdir):
            path = os.path.join(currentdir, filename)
            # 'with' closes the file even if decoding or counting fails.
            with codecs.open(path, 'r', 'iso8859-1') as f:
                for line in f:
                    for token in line.split():
                        # get rid of punctuation and lowercase token
                        token = token.strip('\'".,?:-').lower()
                        if token != '' and token not in self.stopwords:
                            self.vocabulary[token] = (
                                self.vocabulary.get(token, 0) + 1)
                            counts[token] = counts.get(token, 0) + 1
                            total += 1
        return (counts, total)

    def classify(self, filename):
        """Return the most probable category for the file at filename.

        The file is tokenized the same way as the training data.  For
        every token found in the vocabulary, the log of its probability
        is added to each category's score (logs avoid floating-point
        underflow from multiplying many small probabilities).  Tokens
        outside the vocabulary are ignored.  The category with the
        highest score is returned.
        """
        scores = {}
        for category in self.categories:
            scores[category] = 0
        with codecs.open(filename, 'r', 'iso8859-1') as f:
            for line in f:
                for token in line.split():
                    token = token.strip('\'".,?:-').lower()
                    if token in self.vocabulary:
                        for category in self.categories:
                            scores[category] += math.log(
                                self.prob[category][token])
        ranked = sorted(scores.items(), key=lambda pair: pair[1],
                        reverse=True)
        # for debugging this can be changed to return the entire list
        return ranked[0][0]

    def testCategory(self, directory, category):
        """Classify every file in directory and compare with category.

        Returns a tuple (correct, total): the number of files classified
        as category and the number of files examined.  A trailing path
        separator on directory is optional.
        """
        total = 0
        correct = 0
        for filename in os.listdir(directory):
            total += 1
            if self.classify(os.path.join(directory, filename)) == category:
                correct += 1
        return (correct, total)

    def test(self, testdir):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category.

        Prints a progress dot per category and then the overall
        accuracy.  When no test files are found, a message is printed
        instead of dividing by zero.
        """
        # filter out files that are not directories
        categories = [name for name in os.listdir(testdir)
                      if os.path.isdir(os.path.join(testdir, name))]
        correct = 0
        total = 0
        for category in categories:
            print(".", end="")
            # The empty last component keeps the trailing separator that
            # was previously passed to testCategory.
            (catCorrect, catTotal) = self.testCategory(
                os.path.join(testdir, category, ''), category)
            correct += catCorrect
            total += catTotal
        if total == 0:
            print("\n\nNo test instances found in %s" % testdir)
            return
        print("\n\nAccuracy is  %f%%  (%i test instances)" %
              ((float(correct) / total) * 100, total))
            
# Driver script: trains and evaluates the classifier once per stopword
# list.  Adjust the paths below to match your directory structure.
baseDirectory = "/Users/raz/Dropbox/guide/data/20news-bydate/"
trainingDir = baseDirectory + "20news-bydate-train/"
testDir = baseDirectory + "20news-bydate-test/"


stoplistfile = "/Users/raz/Downloads/20news-bydate/stopwords0.txt"

# (banner printed before the run, stopword file under baseDirectory)
for _banner, _stopwordName in (
        ("Reg stoplist 0 ", "stopwords0.txt"),
        ("\n\nReg stoplist 25 ", "stopwords25.txt"),
        ("\n\nReg stoplist 174 ", "stopwords174.txt")):
    print(_banner)
    bT = BayesText(trainingDir, baseDirectory + _stopwordName)
    print("Running Test ...")
    bT.test(testDir)

四、樸素貝葉斯以及情感分析

情感分析的目標是確定作者的態度或看法

一種常見的情感分析是確定某條評論的極性(正向或負向)


閱讀本書成為資料探勘專家的可能性不會比閱讀鋼琴書成為鋼琴演奏高手的可能性更大。你需要不斷實踐。



           

  




相關文章