Python自然語言處理 6 學習分類文字

CopperDong發表於2017-11-24

回答下列問題:

(1)怎樣才能識別出語言資料中明顯用於分類的特徵?

(2)怎樣才能構建用於自動執行語言處理任務的語言模型?

(3)從這些模型中我們可以學到哪些關於語言的知識?

決策樹,樸素貝葉斯分類器和最大熵(shang)分類

一 監督式分類

#性別鑑定

建立分類

def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')
{'last_letter': 'k'}

from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] + 
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

import nltk
featuresets = [ (gender_features(n), g) for (n,g) in names ]
train_set, test_set = featuresets[500:], featuresets[:500]   #訓練集和測試集
classifier = nltk.NaiveBayesClassifier.train(train_set)

classifier.classify(gender_features('Neo'))
'male'
classifier.classify(gender_features('Trinity'))
'female'
print nltk.classify.accuracy(classifier, test_set)   #評估
0.75
classifier.show_most_informative_features(5)  #哪些特徵對於區分名字的性別是最有效的
Most Informative Features
             last_letter = u'a'           female : male   =     33.4 : 1.0
             last_letter = u'k'             male : female =     30.8 : 1.0
             last_letter = u'f'             male : female =     17.3 : 1.0
             last_letter = u'p'             male : female =     10.5 : 1.0
             last_letter = u'd'             male : female =     10.0 : 1.0

#選擇正確的特徵

def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
gender_features2('JJohn')

featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)  #使用樸素貝葉斯分類器
print nltk.classify.accuracy(classifier, test_set)
0.776
#一種能有效完善特徵集的方法稱為錯誤分析。首先,選擇開發集,其中包含用於建立模型的語料資料。然後將這種開發集分為訓練集和開發測試集
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
test_set = [(gender_features(n),g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)
0.766
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )
for (tag, guess, name) in sorted(errors):  
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)
correct=female   guess=male     name=Abagael                       
correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur
...      

#調整特徵提取器使其包含兩個字母后綴的特徵
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, devtest_set)

0.784
#文件分類

將電影評論語料庫歸類為正面或負面

from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
<text, categories>

#文件分類的特徵提取器,其特徵表示每個詞是否在一個給定的文件中
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
print document_features(movie_reviews.words('pos/cv957_8737.txt'))
{u'contains(corporate)': False, u'contains(barred)': False, u'contains(batmans)': False, u'contains(menacing)': False, 
u'contains(rags)': False, u'contains(inquires)': False, 

#訓練和測試分類器以進行文件分類
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
0.73
classifier.show_most_informative_features(5)  #找出哪些特徵是分類器發現的並且是最有資訊量的
Most Informative Features
          contains(sans) = True              neg : pos    =      9.1 : 1.0
    contains(mediocrity) = True              neg : pos    =      7.8 : 1.0
     contains(dismissed) = True              pos : neg    =      6.9 : 1.0
     contains(testament) = True              pos : neg    =      6.5 : 1.0
   contains(bruckheimer) = True              neg : pos    =      6.4 : 1.0

#詞性標註

from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
from operator import itemgetter
common_suffixes = sorted(suffix_fdist.items(), key=itemgetter(1), reverse=True)
common_suffixes[:100]
[(u'e', 202946),
 (u',', 175002),
 (u'.', 152999),
 (u's', 128722),
 (u'd', 105687),
 (u't', 94459),
common_suf = [ suffix[0] for suffix in common_suffixes][:100]
common_suf

定義一個特徵提取器函式,用來檢查給定單詞的字尾

def pos_features(word):
    features = {}
    for suffix in common_suf:
        features['endswith(%s)'%suffix] = word.lower().endswith(suffix)
    return features
訓練新的“決策樹”的分類器

tagged_words = brown.tagged_words(categories='news')
tagged_words[0]
(u'The', u'AT')
len(tagged_words)
100554
len(pos_features(tagged_words[0][0]))
100
pos_features(tagged_words[0][0])
{u"endswith('')": False,
 u"endswith(')": False,
 u"endswith('s)": False,
 u'endswith(()': False,
 u'endswith())': False,
 u'endswith(,)': False,
featuresets = [(pos_features(n),g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
size
Out[52]:  10055
train_set,test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)   #決策樹
nltk.classify.accuracy(classifier, test_set)
0.6270512182993535
classifier.classify(pos_features('cats'))
Out[54]:  u'NNS'
#決策樹的優點是容易解釋,甚至可以它們以虛擬碼形式輸出
print classifier.pseudocode(depth=4)
if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return u'.'
      if endswith(.) == True: return u'.'
    if endswith(s) == True: 
      if endswith(is) == False: return u'PP$'
      if endswith(is) == True: return u'BEZ'
  if endswith(,) == True: return u','
if endswith(the) == True: return u'AT'

#探索上下文語境

不是隻傳遞已標註的詞,而是傳遞整個(未標註的)句子,以及目標詞的索引

#特徵檢測器

def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features
brown.sents()[0][7]
Out[62]:  u'an'
brown.sents()[0][8]
Out[63]:  u'investigation'
pos_features(brown.sents()[0], 8)    ###### 四個特徵
{'prev-word': u'an',
 'suffix(1)': u'n',
 'suffix(2)': u'on',
 'suffix(3)': u'ion'}
>>>

tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )
size = int(len(featuresets) * 0.1)
10055
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.7891596220785678

#序列分類

在詞性標註的例子中,可以使用各種不同的序列分類器模型為給定的句子中的所有詞選擇詞性標註

一種稱為連續分類或貪婪序列分類的序列分類器策略,為第一個輸入找到最有可能的類標籤,然後在此基礎上找到下一個輸入的最佳的標籤。這個過程可以不斷重複直到所有的輸入都被貼上標籤。

特徵提取器

def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:] }
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["pre-tag"] = history[i-1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)   ######
            self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print tagger.evaluate(test_sents)

#其他序列分類方法

這種方法的缺點是一旦做出決定便無法更改。例如:如果決定將一個詞標註為名詞,但後來發現應該是動詞,那也沒有辦法修復我們的錯誤了。解決這個問題的方法是採取轉型策略。轉型聯合分類的工作原理是為輸入的標籤建立一個初始值,然後反覆提煉該值,嘗試修復相關輸入之間的不一致

另一種方案是為詞性標記所有可能的序列打分,選擇總得分最高的序列。隱馬爾科夫模型就採取了這種方法。隱maerkefumox類似於連續分類器,不光考慮輸入也考慮已預測標記的歷史。然而,不是簡單地找出一個給定詞的單個最好標籤,而是為標記產生一個概率分佈。然後這些概率結合起來計算標記序列的概率得分,最後選擇最高概率的標記序列。不過,可能的標籤序列數量相當大。給定擁有30個標籤的標記集,大約有600萬億(30^10)中方式來標記一個10個詞的句子。為了避免單獨考慮所有這些可能的序列,隱馬爾科夫模型要求特徵提取器只考慮最近的標記(或最近的n個標記,其中n是相當小的)。由於這種限制,它可以使用動態規劃來有效地找出最有可能的標記序列。特別是,對每個連續的詞索引i,當前的及以前的每個可能的標記都將計算得分。這種基礎的方法被兩個更先進的模型所採用,它們被稱為最大熵馬爾科夫模型和線性鏈條件隨機場模型;但為標記序列打分用的是不同的演算法。

二 監督式分類的舉例

#句子分割

第一步是獲得一些已被分割成句子的資料,將它轉換成一種適合提取特徵的形式

sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)
def punct_features(tokens, i):
    return { 'next-word-capitailized': tokens[i+1][0].isupper(),
             'prevword': tokens[i-1].lower(),
             'punct': tokens[i],
             'prev-word-is-one-char': len(tokens[i-1]) == 1}
featuresets = [(punct_features(tokens, i), (i in boundaries)) for i in range(1, len(tokens)-1) if tokens[i] in '.?!']
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.936026936026936

def segment_sentences(words):  #基於分類的斷句器
    start = 0
    sents = []
    for i, word in words:
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
        return sents

#識別對話行為型別

表述行為的陳述句,問候,問題,回答,斷言和說明都可以被認為是基於語言的行為型別。識別對話中隱含言語下的對話行為是理解談話的重要步驟。

利用NPS聊天語料庫建立一個分類器,用來識別新的即時訊息帖子的對話行為型別

posts = nltk.corpus.nps_chat.xml_posts()[:10000]   #每個帖子的XML註釋
def dialogue_act_features(post):   #特徵提取器
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
({'contains(gay)': True,
  'contains(im)': True,
  'contains(left)': True,
  'contains(name)': True,
  'contains(now)': True,
  'contains(this)': True,
  'contains(with)': True},
 'Statement')  #陳述句
size = int(len(featuresets) * 0.1)   #分類器
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
0.668

#識別文字蘊涵

(Recognizing textual entailment, RTE)是判斷文字T內的一個給定片段是否繼承另一個叫做“假設”的文字。迄今為止,已經有4個RTE挑戰賽,在那裡共享的開發和測試資料會提供給參賽隊伍。

def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print extractor.text_words
set(['Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'was', 'that', 'republics', 'former', 'Co', 'representing', 'Russia', 'Parviz', 'central', 'meeting', 'together', 'binds', 'terrorism.'])
print extractor.hyp_words
set(['member', 'SCO.', 'China'])
print extractor.overlap('word')
set([])
print extractor.overlap('ne')
set(['China'])
print extractor.hyp_extra('word')
set(['member'])

#擴充套件到大型資料集

純Python的分類不是很快,建議探索NLTK與外部機器學習包的介面技術,

三 評估

測試集

準確度

精確度和召回率

混淆矩陣

交叉驗證

四 決策樹

熵和資訊增益

五 樸素貝葉斯分類器

潛在概率模型

零計數和平滑

非二元特徵

獨立的樸素性

雙重計數的原因

六 最大熵分類器

最大熵模型

熵的最大化

生成式分類器對比條件分類器

七 為語言模式建模

模型告訴我們什麼?

八 深入閱讀

使用Weka, Mallet, TADM 和 MegaM






























相關文章