Machine Learning in Python, 09_02: Decision Trees (CART)

Published by 努力的番茄 on 2020-05-27

Introduction

The CART tree, short for classification and regression tree, can, as the name suggests, handle both classification and regression tasks. It is widely used, most often as the base learner in ensemble methods. In short, it differs from ID3/C4.5 in two ways:

(1) it is a binary tree;

(2) feature selection works differently: the CART classification tree selects features with the Gini index, while the CART regression tree selects features with the squared error.

The CART classification tree and the CART regression tree are introduced in turn below.

CART Classification Tree

We first introduce the feature-selection criterion, the Gini index, where \(p_k\) denotes the probability of class \(k\):

\[Gini(p)=\sum_{k=1}^Kp_k(1-p_k)=1-\sum_{k=1}^Kp_k^2 \]

Accordingly, for a given sample set \(D\), its Gini index is:

\[Gini(D)=1-\sum_{k=1}^K(\frac{\mid C_k \mid}{\mid D \mid})^2 \]

Here \(C_k\) is the subset of samples in \(D\) that belong to class \(k\), and \(K\) is the number of classes. Since a CART tree is binary, to judge the contribution of a feature \(A\) to the class label we only need to check whether the feature equals some value \(a\), splitting the current dataset into two parts \(D_1\) and \(D_2\):

\[D_1=\{(x,y)\in D\mid A(x)=a\},D_2=D-D_1 \]

So, conditioned on the feature \(A(x)=a\), the Gini index of the set \(D\) is defined as:

\[Gini(D,A,a)=\frac{\mid D_1 \mid}{\mid D \mid}Gini(D_1)+\frac{\mid D_2 \mid}{\mid D \mid}Gini(D_2) \]

where \(D_1\) and \(D_2\) are defined as above.
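
As a quick numeric illustration: suppose \(D\) contains 10 samples, 6 of class 1 and 4 of class 2, and the split \(A(x)=a\) sends 5 samples (4 of class 1, 1 of class 2) to \(D_1\) and the remaining 5 (2 of class 1, 3 of class 2) to \(D_2\). Then:

\[Gini(D)=1-0.6^2-0.4^2=0.48,\quad Gini(D_1)=1-0.8^2-0.2^2=0.32,\quad Gini(D_2)=1-0.4^2-0.6^2=0.48 \]

\[Gini(D,A,a)=\frac{5}{10}\times0.32+\frac{5}{10}\times0.48=0.40<0.48 \]

so this split reduces the impurity.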

Code Implementation

Next we implement the CART classification tree. The biggest difference from ID3/C4.5 is that each node is split into exactly two children.

"""
定義計算gini係數相關的函式,程式碼封裝到ml_models.utils
"""
import numpy as np
def gini(x, sample_weight=None):
    """
    計算基尼係數 Gini(D)
    :param x:
    :param sample_weight:
    :return:
    """
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    x_counter = {}
    weight_counter = {}
    # count the occurrences of each value of x and collect the corresponding sample weights
    for index in range(0, x_num):
        x_value = x[index]
        if x_counter.get(x_value) is None:
            x_counter[x_value] = 0
            weight_counter[x_value] = []
        x_counter[x_value] += 1
        weight_counter[x_value].append(sample_weight[index])

    # compute the Gini index
    gini_value = 1.0
    for key, value in x_counter.items():
        p_i = 1.0 * value * np.mean(weight_counter.get(key)) / x_num
        gini_value -= p_i * p_i
    return gini_value


def cond_gini(x, y, sample_weight=None):
    """
    計算條件gini係數:Gini(y,x)
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    # accumulate the weighted Gini of each branch
    gini_value = .0
    for x_value in set(x):
        x_index = np.where(x == x_value)
        new_x = x[x_index]
        new_y = y[x_index]
        new_sample_weight = sample_weight[x_index]
        p_i = 1.0 * len(new_x) / x_num
        gini_value += p_i * gini(new_y, new_sample_weight)
    return gini_value


def gini_gain(x, y, sample_weight=None):
    """
    gini值的增益
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return gini(y, sample_weight) - cond_gini(x, y, sample_weight)
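
As a quick sanity check of these helpers (an illustrative snippet, not from the original post), a perfectly separating binary split should recover the parent's full Gini value as gain:

# illustrative check: a perfectly separating binary split
x = [0, 0, 1, 1]  # binary split indicator
y = [0, 0, 1, 1]  # labels, perfectly separated by x
print(gini(y))  # 0.5: two balanced classes
print(cond_gini(x, y))  # 0.0: both branches are pure
print(gini_gain(x, y))  # 0.5: the gain equals the parent's Gini value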
import os
os.chdir('../')
from ml_models import utils
from ml_models.wrapper_models import DataBinWrapper
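
DataBinWrapper itself is not shown in this post; judging from how it is used below (fit learns per-feature cut points, transform maps continuous values to discrete bin codes, controlled by max_bins), a minimal stand-in might look like the sketch below. The name SimpleBinWrapper and the quantile-based edges are assumptions for illustration, not the actual ml_models implementation.

"""
Hypothetical stand-in for DataBinWrapper (assumed behavior: quantile binning)
"""
class SimpleBinWrapper:
    def __init__(self, max_bins=10):
        self.max_bins = max_bins
        self.edges = None

    def fit(self, x):
        # interior quantile cut points, one set per feature column
        qs = np.linspace(0, 1, self.max_bins + 1)[1:-1]
        self.edges = [np.quantile(x[:, j], qs) for j in range(x.shape[1])]

    def transform(self, x):
        # replace each value by the index of the bin it falls into
        return np.column_stack([np.digitize(x[:, j], self.edges[j])
                                for j in range(x.shape[1])])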
"""
CART分類樹的實現,程式碼封裝到ml_models.tree模組
"""
class CARTClassifier(object):
    class Node(object):
        """
        樹節點,用於儲存節點資訊以及關聯子節點
        """

        def __init__(self, feature_index: int = None, feature_value=None, target_distribute: dict = None,
                     weight_distribute: dict = None,
                     left_child_node=None, right_child_node=None, num_sample: int = None):
            """
            :param feature_index: 特徵id
            :param feature_value: 特徵取值
            :param target_distribute: 目標分佈
            :param weight_distribute:權重分佈
            :param left_child_node: 左孩子結點
            :param right_child_node: 右孩子結點
            :param num_sample:樣本量
            """
            self.feature_index = feature_index
            self.feature_value = feature_value
            self.target_distribute = target_distribute
            self.weight_distribute = weight_distribute
            self.left_child_node = left_child_node
            self.right_child_node = right_child_node
            self.num_sample = num_sample

    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 min_impurity_decrease=0, max_bins=10):
        """
        :param criterion:劃分標準,預設為gini,另外entropy表示用資訊增益比
        :param max_depth:樹的最大深度
        :param min_samples_split:當對一個內部結點劃分時,要求該結點上的最小樣本數,預設為2
        :param min_samples_leaf:設定葉子結點上的最小樣本數,預設為1
        :param min_impurity_decrease:打算劃分一個內部結點時,只有當劃分後不純度(可以用criterion引數指定的度量來描述)減少值不小於該引數指定的值,才會對該結點進行劃分,預設值為0
        """
        self.criterion = criterion
        if criterion == 'gini':
            self.criterion_func = utils.gini_gain
        else:
            self.criterion_func = utils.info_gain_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease

        self.root_node: self.Node = None
        self.dbw = DataBinWrapper(max_bins=max_bins)

    def _build_tree(self, current_depth, current_node: Node, x, y, sample_weight):
        """
        遞迴進行特徵選擇,構建樹
        :param x:
        :param y:
        :param sample_weight:
        :return:
        """
        rows, cols = x.shape
        # compute the distribution of y and the corresponding weight distribution
        target_distribute = {}
        weight_distribute = {}
        for index, tmp_value in enumerate(y):
            if tmp_value not in target_distribute:
                target_distribute[tmp_value] = 0.0
                weight_distribute[tmp_value] = []
            target_distribute[tmp_value] += 1.0
            weight_distribute[tmp_value].append(sample_weight[index])
        for key, value in target_distribute.items():
            target_distribute[key] = value / rows
            weight_distribute[key] = np.mean(weight_distribute[key])
        current_node.target_distribute = target_distribute
        current_node.weight_distribute = weight_distribute
        current_node.num_sample = rows
        # check the stopping conditions

        if len(target_distribute) <= 1:
            return

        if rows < self.min_samples_split:
            return

        if self.max_depth is not None and current_depth > self.max_depth:
            return

        # search for the best feature and split value
        best_index = None
        best_index_value = None
        best_criterion_value = 0
        for index in range(0, cols):
            for index_value in set(x[:, index]):
                criterion_value = self.criterion_func((x[:, index] == index_value).astype(int), y, sample_weight)
                if criterion_value > best_criterion_value:
                    best_criterion_value = criterion_value
                    best_index = index
                    best_index_value = index_value

        # stop if no split was found or the criterion decrease is insufficient
        if best_index is None:
            return
        if best_criterion_value <= self.min_impurity_decrease:
            return
        # split
        current_node.feature_index = best_index
        current_node.feature_value = best_index_value
        selected_x = x[:, best_index]

        # build the left child node
        left_selected_index = np.where(selected_x == best_index_value)
        # if the split leaves too few samples to form a leaf, stop splitting
        if len(left_selected_index[0]) >= self.min_samples_leaf:
            left_child_node = self.Node()
            current_node.left_child_node = left_child_node
            self._build_tree(current_depth + 1, left_child_node, x[left_selected_index], y[left_selected_index],
                             sample_weight[left_selected_index])
        # build the right child node
        right_selected_index = np.where(selected_x != best_index_value)
        # if the split leaves too few samples to form a leaf, stop splitting
        if len(right_selected_index[0]) >= self.min_samples_leaf:
            right_child_node = self.Node()
            current_node.right_child_node = right_child_node
            self._build_tree(current_depth + 1, right_child_node, x[right_selected_index], y[right_selected_index],
                             sample_weight[right_selected_index])

    def fit(self, x, y, sample_weight=None):
        # default sample_weight to uniform weights
        n_sample = x.shape[0]
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_sample)
        # check the sample_weight size
        if len(sample_weight) != n_sample:
            raise Exception('sample_weight size error:', len(sample_weight))

        # build an empty root node
        self.root_node = self.Node()

        # bin the features in x
        self.dbw.fit(x)

        # recursively build the tree
        self._build_tree(1, self.root_node, self.dbw.transform(x), y, sample_weight)

    # look up the prediction stored at a leaf node
    def _search_node(self, current_node: Node, x, class_num):
        if current_node.left_child_node is not None and x[current_node.feature_index] == current_node.feature_value:
            return self._search_node(current_node.left_child_node, x, class_num)
        elif current_node.right_child_node is not None and x[current_node.feature_index] != current_node.feature_value:
            return self._search_node(current_node.right_child_node, x, class_num)
        else:
            result = []
            total_value = 0.0
            for index in range(0, class_num):
                value = current_node.target_distribute.get(index, 0) * current_node.weight_distribute.get(index, 1.0)
                result.append(value)
                total_value += value
            # normalize
            for index in range(0, class_num):
                result[index] = result[index] / total_value
            return result

    def predict_proba(self, x):
        # compute the predicted probability distribution
        x = self.dbw.transform(x)
        rows = x.shape[0]
        results = []
        class_num = len(self.root_node.target_distribute)
        for row in range(0, rows):
            results.append(self._search_node(self.root_node, x[row], class_num))
        return np.asarray(results)

    def predict(self, x):
        return np.argmax(self.predict_proba(x), axis=1)

    def _prune_node(self, current_node: Node, alpha):
        # if there are child nodes, prune the subtrees first
        if current_node.left_child_node is not None:
            self._prune_node(current_node.left_child_node, alpha)
        if current_node.right_child_node is not None:
            self._prune_node(current_node.right_child_node, alpha)
        # then try to prune the current node
        if current_node.left_child_node is not None or current_node.right_child_node is not None:
            # avoid pruning across levels
            for child_node in [current_node.left_child_node, current_node.right_child_node]:
                # the node being pruned must have only leaf children
                if child_node.left_child_node is not None or child_node.right_child_node is not None:
                    return
            # compute the loss before pruning
            pre_prune_value = alpha * 2
            for child_node in [current_node.left_child_node, current_node.right_child_node]:
                for key, value in child_node.target_distribute.items():
                    pre_prune_value += -1 * child_node.num_sample * value * np.log(
                        value) * child_node.weight_distribute.get(key, 1.0)
            # compute the loss after pruning
            after_prune_value = alpha
            for key, value in current_node.target_distribute.items():
                after_prune_value += -1 * current_node.num_sample * value * np.log(
                    value) * current_node.weight_distribute.get(key, 1.0)

            if after_prune_value <= pre_prune_value:
                # prune: collapse the children into this node
                current_node.left_child_node = None
                current_node.right_child_node = None
                current_node.feature_index = None
                current_node.feature_value = None

    def prune(self, alpha=0.01):
        """
        決策樹剪枝 C(T)+alpha*|T|
        :param alpha:
        :return:
        """
        # recursively prune
        self._prune_node(self.root_node, alpha)
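
The prune method implements the cost-complexity criterion from its docstring. For a node whose children are both leaves, it compares

\[C_\alpha(T)=\sum_{t\in \mathrm{leaf}(T)}N_tH_t(T)+\alpha\mid T\mid \]

where \(N_t\) is the sample count at leaf \(t\), \(H_t(T)\) its (weight-adjusted) empirical entropy, and \(\mid T\mid\) the number of leaves: keeping the split costs \(2\alpha\) plus the two children's entropy terms, collapsing it costs \(\alpha\) plus the parent's entropy term, and the node is pruned whenever the latter is no larger.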
# generate synthetic data
from sklearn.datasets import make_classification
data, target = make_classification(n_samples=100, n_features=2, n_classes=2, n_informative=1, n_redundant=0,
                                   n_repeated=0, n_clusters_per_class=1, class_sep=.5,random_state=21)
# train and visualize the decision boundary
tree = CARTClassifier()
tree.fit(data, target)
utils.plot_decision_function(data, target, tree)

[Figure: decision boundary of the unpruned CART classifier]

As before, the tree overfits if left unconstrained, so we can prune it...

# prune
tree.prune(5)
utils.plot_decision_function(data, target, tree)

[Figure: decision boundary after pruning]

CART Regression Tree

The regression tree selects features using the squared error: pick a feature \(j\) and a split value \(s\), partition the training set into two parts by \(X^j\leq s\) and \(X^j>s\), and search for the pair \(j,s\) that most reduces the total squared error of the two parts. The process can be written as:

\[\min_{j,s}[\min_{c_1}\sum_{x_i\in R_1(j,s)}(y_i-c_1)^2+\min_{c_2}\sum_{x_i\in R_2(j,s)}(y_i-c_2)^2] \]

where \(R_1(j,s)=\{x\mid x^j\leq s\}\), \(R_2(j,s)=\{x\mid x^j> s\}\), \(c_1=ave(y_i\mid x_i\in R_1(j,s))\), and \(c_2=ave(y_i\mid x_i\in R_2(j,s))\).
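
A small numeric illustration: for \(y=(1,2,10,11)\) with a split between the second and third samples, \(c_1=1.5\) and \(c_2=10.5\), so the post-split error is

\[(1-1.5)^2+(2-1.5)^2+(10-10.5)^2+(11-10.5)^2=1.0 \]

versus a squared error of 82 around the global mean 6, making this \((j,s)\) an excellent split.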

Code implementation:

"""
平方誤差相關函式,封裝到ml_models.utils
"""
def square_error(x, sample_weight=None):
    """
    平方誤差
    :param x:
    :param sample_weight:
    :return:
    """
    x = np.asarray(x)
    x_mean = np.mean(x)
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    error = 0.0
    for index in range(0, x_num):
        error += (x[index] - x_mean) * (x[index] - x_mean) * sample_weight[index]
    return error


def cond_square_error(x, y, sample_weight=None):
    """
    計算按x分組的y的誤差值
    :param x:
    :param y:
    :param sample_weight:
    :return:
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    # accumulate the squared error of each group
    error = .0
    for x_value in set(x):
        x_index = np.where(x == x_value)
        new_y = y[x_index]
        new_sample_weight = sample_weight[x_index]
        error += square_error(new_y, new_sample_weight)
    return error


def square_error_gain(x, y, sample_weight=None):
    """
    平方誤差帶來的增益值
    :param x:
    :param y:
    :param sample_weight:
    :return:
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return square_error(y, sample_weight) - cond_square_error(x, y, sample_weight)
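
A quick check of these helpers on the same toy numbers (an illustrative snippet, not from the original post):

# illustrative check: a split that separates low and high targets
x = [0, 0, 1, 1]  # binary split indicator
y = [1.0, 2.0, 10.0, 11.0]  # target values
print(square_error(y))  # 82.0: SSE around the global mean 6.0
print(cond_square_error(x, y))  # 1.0: 0.5 within each group
print(square_error_gain(x, y))  # 81.0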
"""
CART迴歸樹實現,封裝到ml_models.tree
"""
class CARTRegressor(object):
    class Node(object):
        """
        樹節點,用於儲存節點資訊以及關聯子節點
        """

        def __init__(self, feature_index: int = None, feature_value=None, y_hat=None, square_error=None,
                     left_child_node=None, right_child_node=None, num_sample: int = None):
            """
            :param feature_index: 特徵id
            :param feature_value: 特徵取值
            :param y_hat: 預測值
            :param square_error: 當前結點的平方誤差
            :param left_child_node: 左孩子結點
            :param right_child_node: 右孩子結點
            :param num_sample:樣本量
            """
            self.feature_index = feature_index
            self.feature_value = feature_value
            self.y_hat = y_hat
            self.square_error = square_error
            self.left_child_node = left_child_node
            self.right_child_node = right_child_node
            self.num_sample = num_sample

    def __init__(self, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_std=1e-3,
                 min_impurity_decrease=0, max_bins=10):
        """
        :param criterion:劃分標準,目前僅有平方誤差
        :param max_depth:樹的最大深度
        :param min_samples_split:當對一個內部結點劃分時,要求該結點上的最小樣本數,預設為2
        :param min_std:最小的標準差
        :param min_samples_leaf:設定葉子結點上的最小樣本數,預設為1
        :param min_impurity_decrease:打算劃分一個內部結點時,只有當劃分後不純度(可以用criterion引數指定的度量來描述)減少值不小於該引數指定的值,才會對該結點進行劃分,預設值為0
        """
        self.criterion = criterion
        if criterion == 'mse':
            self.criterion_func = utils.square_error_gain
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_std = min_std
        self.min_impurity_decrease = min_impurity_decrease

        self.root_node: self.Node = None
        self.dbw = DataBinWrapper(max_bins=max_bins)

    def _build_tree(self, current_depth, current_node: Node, x, y, sample_weight):
        """
        遞迴進行特徵選擇,構建樹
        :param x:
        :param y:
        :param sample_weight:
        :return:
        """
        rows, cols = x.shape
        # compute the weighted mean of the current y
        current_node.y_hat = np.dot(sample_weight / np.sum(sample_weight), y)
        current_node.num_sample = rows
        # check the stopping conditions
        current_node.square_error = np.dot(y - np.mean(y), y - np.mean(y))
        if np.sqrt(current_node.square_error / rows) <= self.min_std:
            return

        if rows < self.min_samples_split:
            return

        if self.max_depth is not None and current_depth > self.max_depth:
            return

        # search for the best feature and split value
        best_index = None
        best_index_value = None
        best_criterion_value = 0
        for index in range(0, cols):
            for index_value in sorted(set(x[:, index])):
                criterion_value = self.criterion_func((x[:, index] <= index_value).astype(int), y, sample_weight)
                if criterion_value > best_criterion_value:
                    best_criterion_value = criterion_value
                    best_index = index
                    best_index_value = index_value

        # stop if no split was found or the criterion decrease is insufficient
        if best_index is None:
            return
        if best_criterion_value <= self.min_impurity_decrease:
            return
        # split
        current_node.feature_index = best_index
        current_node.feature_value = best_index_value
        selected_x = x[:, best_index]

        # build the left child node
        left_selected_index = np.where(selected_x <= best_index_value)
        # if the split leaves too few samples to form a leaf, stop splitting
        if len(left_selected_index[0]) >= self.min_samples_leaf:
            left_child_node = self.Node()
            current_node.left_child_node = left_child_node
            self._build_tree(current_depth + 1, left_child_node, x[left_selected_index], y[left_selected_index],
                             sample_weight[left_selected_index])
        # build the right child node
        right_selected_index = np.where(selected_x > best_index_value)
        # if the split leaves too few samples to form a leaf, stop splitting
        if len(right_selected_index[0]) >= self.min_samples_leaf:
            right_child_node = self.Node()
            current_node.right_child_node = right_child_node
            self._build_tree(current_depth + 1, right_child_node, x[right_selected_index], y[right_selected_index],
                             sample_weight[right_selected_index])

    def fit(self, x, y, sample_weight=None):
        # default sample_weight to uniform weights
        n_sample = x.shape[0]
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_sample)
        # check the sample_weight size
        if len(sample_weight) != n_sample:
            raise Exception('sample_weight size error:', len(sample_weight))

        # build an empty root node
        self.root_node = self.Node()

        # bin the features in x
        self.dbw.fit(x)

        # recursively build the tree
        self._build_tree(1, self.root_node, self.dbw.transform(x), y, sample_weight)

    # look up the prediction stored at a leaf node
    def _search_node(self, current_node: Node, x):
        if current_node.left_child_node is not None and x[current_node.feature_index] <= current_node.feature_value:
            return self._search_node(current_node.left_child_node, x)
        elif current_node.right_child_node is not None and x[current_node.feature_index] > current_node.feature_value:
            return self._search_node(current_node.right_child_node, x)
        else:
            return current_node.y_hat

    def predict(self, x):
        # compute the predictions
        x = self.dbw.transform(x)
        rows = x.shape[0]
        results = []
        for row in range(0, rows):
            results.append(self._search_node(self.root_node, x[row]))
        return np.asarray(results)

    def _prune_node(self, current_node: Node, alpha):
        # if there are child nodes, prune the subtrees first
        if current_node.left_child_node is not None:
            self._prune_node(current_node.left_child_node, alpha)
        if current_node.right_child_node is not None:
            self._prune_node(current_node.right_child_node, alpha)
        # then try to prune the current node
        if current_node.left_child_node is not None or current_node.right_child_node is not None:
            # avoid pruning across levels
            for child_node in [current_node.left_child_node, current_node.right_child_node]:
                # the node being pruned must have only leaf children
                if child_node.left_child_node is not None or child_node.right_child_node is not None:
                    return
            # compute the loss before pruning
            pre_prune_value = alpha * 2 + \
                              (0.0 if current_node.left_child_node.square_error is None else current_node.left_child_node.square_error) + \
                              (0.0 if current_node.right_child_node.square_error is None else current_node.right_child_node.square_error)
            # compute the loss after pruning
            after_prune_value = alpha + current_node.square_error

            if after_prune_value <= pre_prune_value:
                # prune: collapse the children into this node
                current_node.left_child_node = None
                current_node.right_child_node = None
                current_node.feature_index = None
                current_node.feature_value = None
                current_node.square_error = None

    def prune(self, alpha=0.01):
        """
        決策樹剪枝 C(T)+alpha*|T|
        :param alpha:
        :return:
        """
        # recursively prune
        self._prune_node(self.root_node, alpha)
# generate data
data = np.linspace(1, 10, num=100)
target = np.sin(data) + np.random.random(size=100)  # add noise
data = data.reshape((-1, 1))
tree = CARTRegressor(max_bins=50)
tree.fit(data, target)
import matplotlib.pyplot as plt
plt.scatter(data, target)
plt.plot(data, tree.predict(data), color='r')

[Figure: regression fit of the unpruned CART regressor]

# prune
tree.prune(1)
plt.scatter(data, target)
plt.plot(data, tree.predict(data), color='r')

[Figure: regression fit after pruning]
