04 ML 決策樹入門 ID3 演算法實現

weixin_34146805發表於2016-05-11

參考自：《Machine Learning in Action》第 3 章（決策樹）

from math import log
import operator 
import pickle

def calc_shannon_ent(data_set):
    """Return the base-2 Shannon entropy of the class labels in ``data_set``.

    The last element of every row is treated as that row's class label. The
    result is ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
    distinct label, so a higher value means the labels are more mixed.

    Args:
        data_set: Sequence of rows; only ``row[-1]`` of each row is read.

    Returns:
        The entropy as a float. ``0.0`` when ``data_set`` is empty or when
        every row carries the same label.
    """
    total = len(data_set)

    # Tally how many rows carry each label.
    tally = {}
    for row in data_set:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)  # expected information, log base 2
    return entropy

def split_dataset(dataset, axis, value):
    """Select the rows whose ``axis`` column equals ``value`` and drop that column.

    Example: with ``dataset = [[1, 0, 0], [1, 1, 0], [0, 1, 2]]``,
    ``axis = 0`` and ``value = 1`` the result is ``[[0, 0], [1, 0]]``.

    Args:
        dataset: Sequence of list rows.
        axis: Index of the column to test and then remove.
        value: Value the ``axis`` column must equal for a row to be kept.

    Returns:
        A new list of new rows; the rows of ``dataset`` are not modified.
    """
    matching_rows = (row for row in dataset if row[axis] == value)

    subset = []
    for row in matching_rows:
        # Everything before the split column, then everything after it.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset

def choose_best_feature_to_split(dataset):
    """Pick the feature column to split on using the ID3 information-gain rule.

    The last column of every row is the class label; all earlier columns are
    features. For each feature the dataset is partitioned by that feature's
    distinct values, and the size-weighted entropy of the partitions is
    subtracted from the entropy of the whole dataset to get the gain.

    Args:
        dataset: Non-empty sequence of list rows, label in the last position.

    Returns:
        The index of the feature with the largest gain, or ``-1`` when no
        feature produces a gain strictly greater than zero.
    """
    n_features = len(dataset[0]) - 1
    n_rows = float(len(dataset))
    baseline = calc_shannon_ent(dataset)  # entropy of the whole dataset

    winner = -1
    winner_gain = 0.0
    for col in range(n_features):
        distinct_values = set([row[col] for row in dataset])

        # Weighted entropy remaining after splitting on this column.
        split_entropy = 0.0
        for val in distinct_values:
            part = split_dataset(dataset, col, val)
            weight = len(part) / n_rows
            split_entropy += weight * calc_shannon_ent(part)

        # Lower remaining entropy means purer partitions and a larger gain.
        gain = baseline - split_entropy
        if gain > winner_gain:
            winner, winner_gain = col, gain
    return winner

def majority_cnt(class_list):
    """Return the label that occurs most often in ``class_list``.

    Used as the majority-vote fallback when every feature has been consumed
    but the remaining rows still carry more than one label.

    Args:
        class_list: Non-empty sequence of hashable class labels.

    Returns:
        The most frequent label.

    Raises:
        IndexError: If ``class_list`` is empty.
    """
    class_count = {}
    for vote in class_list:
        class_count[vote] = class_count.get(vote, 0) + 1

    # items() exists on both Python 2 and 3; iteritems() is Python 2 only
    # and raises AttributeError on Python 3.
    sorted_class_count = sorted(class_count.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    return sorted_class_count[0][0]

def create_decision_tree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        dataset: Non-empty sequence of list rows; the last element of each
            row is the class label and the earlier elements are features.
        labels: Feature names, one per feature column and in column order.
            This list is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    class_list = [example[-1] for example in dataset]  # the label column

    # Every row has the same class: stop and return it as a leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # Only the label column is left: fall back to a majority vote.
    if len(dataset[0]) == 1:
        return majority_cnt(class_list)

    best_feature = choose_best_feature_to_split(dataset)
    if best_feature < 0:
        # No feature has a positive information gain. Using -1 as an index
        # would select the class column itself as the split, so vote instead.
        return majority_cnt(class_list)

    best_feature_label = labels[best_feature]

    # Work on a copy so the caller's label list stays intact; the chosen
    # feature is removed because split_dataset drops its column.
    sublabels = list(labels)
    del sublabels[best_feature]

    # One branch per distinct value of the chosen feature.
    branches = {}
    for value in set(example[best_feature] for example in dataset):
        subset = split_dataset(dataset, best_feature, value)
        branches[value] = create_decision_tree(subset, sublabels)
    return {best_feature_label: branches}
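# Worked example for the sample data:
# 1. dataset = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# 2. Column 0 ('no surfacing') has the highest information gain, so it is the root split:
#      value 1 -> x1 = [1, 1, 'yes'], x2 = [1, 1, 'yes'], x3 = [1, 0, 'no']
#      value 0 -> y1 = [0, 1, 'no'],  y2 = [0, 1, 'no']   (all 'no', so this branch is a leaf)
# 3. Only the value-1 branch is split again, on column 1 ('flippers'):
#      value 1 -> x1, x2 -> 'yes'
#      value 0 -> x3     -> 'no'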

def desicion_tree_classify(input_tree, feat_labels, test_vector):
    """Classify ``test_vector`` by walking a nested-dict decision tree.

    Args:
        input_tree: Tree of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        feat_labels: Feature names in the same order as the entries of
            ``test_vector``; used to map a feature name to its index.
        test_vector: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that is reached.

    Raises:
        IndexError: If ``input_tree`` is empty.
        ValueError: If a node's feature name is not in ``feat_labels``.
        KeyError: If the tree has no branch for the sample's feature value.
    """
    # A tree node has a single key: the feature it tests. list(...)[0] works
    # on Python 2 and 3, whereas dict.keys()[0] fails on Python 3 views.
    root_label = list(input_tree)[0]
    branches = input_tree[root_label]
    feat_idx = feat_labels.index(root_label)  # feature name -> column index

    # Follow the branch that matches the sample's value for this feature.
    subtree = branches[test_vector[feat_idx]]

    # A dict is another decision node; anything else is a leaf label.
    if isinstance(subtree, dict):
        return desicion_tree_classify(subtree, feat_labels, test_vector)
    return subtree

def store_tree(input_tree, filename):
    """Serialize ``input_tree`` to ``filename`` with pickle.

    Args:
        input_tree: The picklable tree to store.
        filename: Path of the file to create or overwrite.
    """
    # pickle writes bytes, so the file must be opened in binary mode (text
    # mode raises TypeError on Python 3). The with-block closes the file
    # even if dump() raises.
    with open(filename, 'wb') as fw:
        pickle.dump(input_tree, fw)

def grab_tree(filename):
    """Load and return a pickled tree from ``filename``.

    Args:
        filename: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # Binary mode is required by pickle on Python 3, and the with-block
    # closes the handle that was previously left open.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

# Demo: build a tree from a small sample and classify one sample with it.
# Each row is [no surfacing, flippers, class label].
data_set = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
# Feature names, in the same order as the feature columns of data_set.
labels = ['no surfacing', 'flippers']

# Index of the best first split; this value is overwritten below before
# anything is printed.
result = choose_best_feature_to_split(data_set)

# Build the tree and print it.
new_tree = create_decision_tree(data_set, labels)
print('new_tree')
print(new_tree) 
# Expected output as recorded by the author:
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

# Use a fresh, complete label list for classification so that the name ->
# index lookup does not depend on what tree construction did to the list
# passed in above.
labels = ['no surfacing', 'flippers']
result = desicion_tree_classify(new_tree, labels, [1, 1])
print('result')
print(result)
# yes

ML《決策樹（一）ID3》
2020-12-27
【面試考】【入門】決策樹演算法ID3，C4.5和CART
2020-05-24
面試演算法
機器學習之決策樹ID3(python實現)
2019-02-27
機器學習Python
ML《決策樹（三）CART》
2020-12-30
鵝廠優文 | 決策樹及ID3演算法學習
2018-03-20
演算法
ML《決策樹（二）C4.5》
2020-12-27
Reinventing the wheel：決策樹演算法的實現
2019-02-16
演算法
決策樹演算法的推理與實現
2022-06-03
演算法
ML《決策樹（四）Bagging 和 Random Forest》
2021-01-02
randomREST
《機器學習Python實現_09_01_決策樹_ID3與C4.5》
2020-05-26
機器學習Python
決策樹中資訊增益、ID3以及C4.5的實現與總結
2020-02-21
決策樹演算法-實戰篇
2020-11-16
演算法
基於資訊增益的ID3決策樹介紹。
2018-03-17
決策樹演算法
2022-03-07
演算法
機器學習實戰（三）決策樹ID3：樹的構建和簡單分類
2018-05-17
機器學習
決策樹在sklearn中的實現
2019-03-07
【Python機器學習實戰】決策樹和整合學習（二）——決策樹的實現
2021-08-25
Python機器學習
機器學習|決策樹-sklearn實現
2020-12-19
機器學習
Python程式設計入門：接地氣的決策樹演算法基礎講解
2019-07-29
Python程式設計演算法
決策樹模型(4)Cart演算法
2024-04-09
模型演算法
決策樹演算法-理論篇
2020-11-09
演算法
機器學習——決策樹模型：Python實現
2020-11-09
機器學習模型Python
決策樹
2024-07-27
分類演算法-決策樹 Decision Tree
2020-01-18
演算法
《統計學習方法》——從零實現決策樹
2021-03-17
決策樹模型(3)決策樹的生成與剪枝
2024-03-28
模型
決策樹示例
2021-01-16
機器學習之決策樹演算法
2019-07-28
機器學習演算法
機器學習之決策樹(Decision Tree)python實現
2018-06-12
機器學習Python
機器學習之決策樹在sklearn中的實現
2019-03-06
機器學習
《機器學習Python實現_09_02_決策樹_CART》
2020-05-27
機器學習Python
4. 決策樹
2020-10-26
Decision tree——決策樹
2020-04-30
決策樹（Decision Tree）
2021-07-13
演算法金 | 突破最強演算法模型，決策樹演算法！！
2024-05-31
演算法模型
【機器學習】實現層面決策樹並用graphviz視覺化樹
2020-10-28
機器學習視覺化
Python機器學習：決策樹001什麼是決策樹
2020-12-24
Python機器學習
通俗地說決策樹演算法（二）例項解析
2019-07-29
演算法
機器學習經典演算法之決策樹
2019-06-16
機器學習演算法

04 ML 決策樹入門 ID3 演算法實現

相關文章