from math import log


def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    print("Total samples: " + str(numEntries))
    labelCounts = {}  # counts how many samples fall into each class label
    # featVec is one feature vector (one row of the data set)
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the last column is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # number of occurrences of currentLabel
        print("Current labelCounts: " + str(labelCounts))
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # probability of each class label
        print("Probability of class " + str(key) + ": " + str(prob))
        print(prob * log(prob, 2))
        shannonEnt -= prob * log(prob, 2)
        print("Entropy: " + str(shannonEnt))
    return shannonEnt


def createDataSet():
    dataSet = [
        # [1, 1, 'yes'],
        # [1, 0, 'yes'],
        # [1, 1, 'no'],
        # [0, 1, 'no'],
        # [0, 1, 'no'],
        # # Rows added at will to watch the entropy change:
        # # the more mixed and conflicting the labels, the larger the entropy.
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'maybe'],
        # [1, 1, 'maybe1'],
        # The eight more extreme examples below make this even clearer.
        # Keep adding rows following this pattern and the entropy keeps growing.
        # [1, 1, '1'],
        # [1, 1, '2'],
        # [1, 1, '3'],
        # [1, 1, '4'],
        # [1, 1, '5'],
        # [1, 1, '6'],
        # [1, 1, '7'],
        # [1, 1, '8'],
        # The opposite extreme: every sample has the same class label,
        # perfectly ordered and not mixed at all, so the entropy is 0.
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
        [1, 1, '1'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def testCalcShannonEnt():
    myDat, labels = createDataSet()
    print(calcShannonEnt(myDat))


if __name__ == '__main__':
    testCalcShannonEnt()
    print(log(0.000002, 2))
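For reference, calcShannonEnt computes the Shannon entropy of the class labels, H(D) = -Σ_k p_k · log2(p_k), where p_k = (count of label k) / numEntries. The second loop accumulates exactly this sum, subtracting prob * log(prob, 2) for each key in labelCounts.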
The following is the output when every sample has a different class label (i.e. with the eight commented rows [1,1,'1'] through [1,1,'8'] enabled instead of the active block):

Total samples: 8
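A quick sanity check on the two extremes: when all eight samples carry different labels, every class probability is 1/8, so H = -8 · (1/8) · log2(1/8) = 3.0, which is the final value printed for that run. With the currently active dataset, where all eight rows share the label '1', the single class has probability 1 and log2(1) = 0, so the entropy printed is 0.0.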
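(The trailing print(log(0.000002, 2)) in the listing simply illustrates that log2 of a very small probability is a large negative number, roughly -18.93.) As a minimal sketch of how the entropy grows with label disorder, the snippet below builds three small datasets and feeds them to calcShannonEnt; it assumes the function from the listing above is defined in the same module, and the dataset names (pure, two_even, all_diff) are made up for this example:

pure = [[1, 1, 'yes']] * 8                           # one class             -> entropy 0.0
two_even = [[1, 1, 'yes']] * 4 + [[1, 1, 'no']] * 4  # two classes, 50/50    -> entropy 1.0
all_diff = [[1, 1, str(i)] for i in range(8)]        # eight distinct labels -> entropy 3.0

for name, ds in [('pure', pure), ('two_even', two_even), ('all_diff', all_diff)]:
    print(name, calcShannonEnt(ds))

The more evenly the samples are spread across distinct labels, the larger the entropy, matching the comments in createDataSet.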