import math
import numpy as np


def calcShannonEnt(data):
    """Return the Shannon entropy of the class labels found in ``data``.

    Each row of ``data`` is a sequence whose last element is the class
    label.  The entropy is computed with the natural logarithm (nats).
    An empty ``data`` yields ``0``.
    """
    total = len(data)
    # Number of rows seen for every class label.
    tally = {}
    for row in data:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # Accumulate -p * ln(p) over the observed classes.
    entropy = 0
    for count in tally.values():
        p = float(count / total)
        entropy -= p * math.log(p)
    return entropy
def splitData(dataSet, axis, value):
    """Select the rows whose ``axis`` column equals ``value``.

    ``axis`` is the index of the feature column to test and ``value`` is
    the feature value to keep.  The tested column is removed from every
    returned row; the rows of ``dataSet`` themselves are left unchanged.
    """
    matching = (row for row in dataSet if row[axis] == value)
    # Drop the tested column; what is left forms the new data set.
    return [row[:axis] + row[axis + 1:] for row in matching]
>>data Out[1]: [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] >>splitData(data, 0, 1) Out[2]: [[1, 'yes'], [1, 'yes'], [0, 'no']] >>splitData(data, 0, 0) Out[3]: [[1, 'no'], [1, 'no']]
# Choose the best feature for splitting the data.
# The input dataSet is a two-dimensional list.
def chooseBestFeatuerToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    ``dataSet`` is a list of rows; the last element of every row is the
    class label and all preceding elements are feature values.

    Returns ``-1`` when no feature yields a strictly positive gain.
    Raises ``IndexError`` when ``dataSet`` is empty.
    """
    # Number of features carried by each sample (last column is the label).
    numFeatures = len(dataSet[0]) - 1
    # The original divided by float(dataSet), which raises TypeError on a
    # list; the weight of a subset is its size over the number of samples.
    numSamples = float(len(dataSet))
    # Entropy of the labels before any split, H(Y).
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0
    bestFeature = -1
    # Evaluate the information gain of every feature.
    for i in range(numFeatures):
        # Values of the i-th feature, i.e. one column of the data.
        featList = [example[i] for example in dataSet]
        newEntropy = 0
        for value in np.unique(featList):
            subDataSet = splitData(dataSet, i, value)
            prob = len(subDataSet) / numSamples
            newEntropy += prob * calcShannonEnt(subDataSet)
        # Information gain G(Y, X) = H(Y) - sum_x p(x) * H(Y | X = x).
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the label that occurs most often in ``classList``.

    When several labels share the highest count, the one that appears
    first in ``classList`` is returned.
    """
    tally = {}
    for vote in classList:
        tally[vote] = tally.get(vote, 0) + 1
    # Stable descending sort by count keeps first-seen labels ahead on ties.
    ranked = sorted(tally, key=lambda label: tally[label], reverse=True)
    return ranked[0]
def creatTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    ``dataSet`` is a list of rows whose last element is the class label.
    ``labels`` is the list of feature names, one per feature column.  It is
    not modified; every recursion level works on its own copy.

    Returns either a class label (leaf) or a dictionary of the form
    ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    Raises ``IndexError`` when ``dataSet`` is empty.
    """
    classList = [example[-1] for example in dataSet]
    # Every sample has the same class: stop and return that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left (all features used up): majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatuerToSplit(dataSet)
    # -1 means no feature separates the classes any further; indexing with
    # it would silently split on the label column, so vote instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # The tree is stored as nested dictionaries.
    myTree = {bestFeatLabel: {}}
    # Remove the feature that was just used.  The original executed
    # ``del labels[bestFeatLabel]``, which indexes a list with a string and
    # raises TypeError; build a reduced copy by position instead so the
    # caller's list stays intact.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    # Values taken by the chosen feature.
    featVals = [example[bestFeat] for example in dataSet]
    # Recurse into one branch per distinct value.
    for value in np.unique(featVals):
        subLabels = remainingLabels[:]
        myTree[bestFeatLabel][value] = creatTree(splitData(dataSet, bestFeat, value), subLabels)
    return myTree
>>data Out[1]: [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] >>labels Out[2]: ['no surfacing', 'flippers'] >>creatTree(data, labels) Out[3]: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
def classify(inputTree, featLabels, testVec):
    """Walk the tree from the root and return the predicted class.

    ``inputTree`` is a nested dictionary of the form
    ``{feature_name: {feature_value: subtree_or_label}}``, ``featLabels``
    lists the feature names in the column order of ``testVec``, and
    ``testVec`` holds the feature values of the sample to classify.

    Returns the class label of the leaf that is reached, or ``None`` when
    the sample's value for a tested feature has no branch in the tree.
    Raises ``ValueError`` when a feature name of the tree is missing from
    ``featLabels``.
    """
    # The single key of a node is the name of the feature it tests.
    # (The original called the non-existent ``inputTree.key()``.)
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key, branch in secondDict.items():
        # The sample's value for this feature decides which branch to take.
        if testVec[featIndex] == key:
            if isinstance(branch, dict):
                # Descend into the matching subtree.  The original passed
                # ``secondDict`` itself, which is not a valid tree node.
                return classify(branch, featLabels, testVec)
            return branch
    # No branch matched; the original failed with UnboundLocalError here.
    return None
# Box and arrow styles shared by the tree-drawing helpers below.
decisionNode = dict(boxstyle='sawtooth', fc="0.8")
leafNode = dict(boxstyle='round4', fc="0.8")
arrow_args = dict(arrowstyle='<-')


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node at ``centerPt`` with an arrow from ``parentPt``.

    Coordinates are in axes-fraction units.  ``nodeType`` is the bbox
    style dictionary (``decisionNode`` or ``leafNode``).  Draws on
    ``createPlot.ax1``, so ``createPlot`` must have set that up first.
    """
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def createPlot(inTree):
    """Render the nested-dictionary tree ``inTree`` and show the figure.

    The drawing state (axes, total width/depth, current offsets) is kept
    as attributes on the ``createPlot`` and ``plotTree`` function objects.
    """
    # Imported locally: this file calls createPlot() before the line that
    # imports matplotlib at module level.
    import matplotlib.pyplot as plt

    fig = plt.figure(1, facecolor='white')
    fig.clf()
    # Hide the axis ticks.  The original built this dictionary but never
    # passed it on, so the ticks stayed visible.
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf-width left of the axes so that the first leaf is
    # centred inside its own slot.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


def getNumLeafs(myTree):
    """Return the number of leaves (non-dict values) in the tree."""
    numLeafs = 0
    firstStr = next(iter(myTree))
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the number of decision levels on the longest root-leaf path."""
    maxDepth = 0
    firstStr = next(iter(myTree))
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def retrieveTree(i):
    """Return the ``i``-th hard-coded sample tree (for trying out plots)."""
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
    ]
    return listOfTrees[i]


def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between ``cntrPt`` and ``parentPt``."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below the point ``parentPt``.

    ``nodeTxt`` is the text written on the edge that leads to this
    subtree.  Reads and updates ``plotTree.xOff`` / ``plotTree.yOff`` and
    reads ``plotTree.totalW`` / ``plotTree.totalD``, which ``createPlot``
    initialises.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = next(iter(myTree))
    # Centre this decision node above the leaves it spans.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step one level down for the children ...
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key, child in list(secondDict.items()):
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            # Each leaf takes the next horizontal slot.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # ... and restore the level before returning to the caller.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
# Build and draw a decision tree for the contact-lens data set
# (tab-separated file; the last column is the class label).
# The context manager closes the file, which the original left open.
with open('lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr]
lenses_labels = ['age', 'prescript', 'astigmatic', 'tearRate']
# Hand the tree builder a copy so the full label list stays available for
# later classification, whatever the builder does with its argument.
lenses_Tree = creatTree(lenses, lenses_labels[:])
createPlot(lenses_Tree)
def testing(myTree, data_test, labels): error = 0.0 for i in range(len(data_test)): classLabel = classify(myTree, labels, data_test[i]) if classLabel != data_test[i][-1]: error += 1 return float(error) # 測試投票節點正確率 def testingMajor(major, data_test): error = 0.0 for i in range(len(data_test)): if major[0] != data_test[i][-1]: error += 1 # print 'major %d' %error return float(error)
def postPruningTree(inTree, dataSet, test_data, labels):
    """Post-prune ``inTree`` bottom-up using held-out data.

    :param inTree: the original tree (nested dictionaries); pruned
        subtrees are written back into it in place
    :param dataSet: the data the (sub)tree was built from; supplies the
        majority class of a node
    :param test_data: held-out data used to compare error counts
    :param labels: feature names, in the column order of both data sets;
        not modified
    :return: the (possibly pruned) tree, or the majority class label when
        a single leaf makes no more errors on ``test_data`` than the tree

    Only branches over string-valued features are descended into.
    """
    firstStr = list(inTree.keys())[0]
    secondDict = inTree[firstStr]
    classList = [example[-1] for example in dataSet]
    labelIndex = labels.index(firstStr)
    # Labels for the child data sets: the tested feature is dropped there.
    # Built as a new list so the caller's ``labels`` is left untouched
    # (the original deleted from it in place and relied on the ``copy``
    # module, which this file never imports).
    subLabels = labels[:labelIndex] + labels[labelIndex + 1:]
    for key in list(secondDict.keys()):
        if not isinstance(secondDict[key], dict):
            continue
        if not isinstance(dataSet[0][labelIndex], str):
            continue
        subDataSet = splitData(dataSet, labelIndex, key)
        subDataTest = splitData(test_data, labelIndex, key)
        # Without held-out samples in this branch nothing can be compared.
        if len(subDataTest) > 0:
            inTree[firstStr][key] = postPruningTree(secondDict[key], subDataSet,
                                                    subDataTest, list(subLabels))
    majority = majorityCnt(classList)
    # Baseline: errors made by replacing this subtree with a majority leaf.
    # The original passed the label list here instead of the test data.
    # testingMajor reads the label from index 0 of its first argument, so
    # it is given a (label, count) pair.
    majorErrors = testingMajor((majority, classList.count(majority)), test_data)
    if testing(inTree, test_data, labels) < majorErrors:
        return inTree
    return majority
# 匯入決策樹包 from sklearn.tree import DecisionTreeClassifier # 畫圖工具包 import matplotlib.pyplot as plt import seaborn as sns sns.set(color_codes=True) # 匯入資料處理的包 from sklearn.model_selection import train_test_split # 模型評估 from sklearn import metrics from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score import missingno as msno_plot
# Load the red-wine data.  A raw string keeps the backslashes of the
# Windows path literal (a plain string treats them as escape sequences).
wine_df = pd.read_csv(r'F:\自學2020\PythonML_Code\Charpter 3\winequality-red.csv', sep=';')
# Inspect the data: 11 features, with `quality` as the class column.
wine_df.describe().transpose().round(2)
# Bar chart of the number of non-missing values in every column.
# NOTE(review): the title is set before msno_plot.bar() draws; if bar()
# opens its own figure the title ends up on a different one — confirm.
plt.title('Non-missing values by columns')
msno_plot.bar(wine_df)
# Draw one box plot per column on a 3 x 4 grid to inspect each column's
# distribution.
plt.figure()
for pos, column in enumerate(wine_df.columns, start=1):
    plt.subplot(3, 4, pos)
    # Pass the values by keyword: recent seaborn releases treat the first
    # positional argument as `data`, not `x`.
    sns.boxplot(x=wine_df[column])
# Cap outliers (the original comment called this missing-value handling):
# every value outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is replaced by the
# nearest of the two bounds.  This is applied to every column.
columns_name = list(wine_df.columns)
for name in columns_name:
    q1, q3 = wine_df[name].quantile([0.25, 0.75])
    IQR = q3 - q1
    lower_cap = q1 - 1.5 * IQR
    upper_cap = q3 + 1.5 * IQR
    wine_df[name] = wine_df[name].apply(
        lambda x: upper_cap if x > upper_cap else (lower_cap if x < lower_cap else x)
    )
# Annotated heat map of the pairwise column correlations.
plt.figure(figsize=(10, 8))
correlations = wine_df.corr()
sns.heatmap(
    correlations,
    annot=True,
    linewidths=.5,
    center=0,
    cbar=False,
    cmap='YlGnBu',
)
# Number of samples per quality grade.
plt.figure(figsize=(10, 8))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.countplot(x=wine_df['quality'])
# Drop the rows whose quality equals 3.5 or 7.5, then merge the rare
# grades: 8 -> 7, 3 -> 5 and 4 -> 5.
quality = wine_df['quality']
# .copy() makes the filtered frame independent, so the assignment below
# does not write into a view of the unfiltered frame.
wine_df = wine_df[(quality != 3.5) & (quality != 7.5)].copy()
# The targets (7 and 5) are never themselves replaced, so one mapping
# gives the same result as the three successive replace() calls.
wine_df['quality'] = wine_df['quality'].replace({8: 7, 3: 5, 4: 5})
wine_df['quality'].value_counts(normalize=True)
# Hold out 30 % of the samples for testing (fixed seed for repeatability).
features = wine_df.drop(['quality'], axis=1)
target = wine_df['quality']
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.3, random_state=22
)
print(X_train.shape, X_test.shape)
Output:(1119, 11) (480, 11)
# Depth-limited decision tree using the Gini criterion.
model = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=3, min_samples_leaf=5)
# Notes on the DecisionTreeClassifier parameters:
#
# criterion: split-quality measure, e.g. 'gini' or 'entropy'.
# class_weight: class weights, None by default.  They can be given as
#     dictionaries; for example, with four classes the weights 1, 5, 1, 1
#     are written as
#     [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]
#     and not as [{1: 1}, {2: 5}, {3: 1}, {4: 1}].
#     It can also be set to 'balanced'; the weights are then computed from
#     the data as n_samples / (n_classes * np.bincount(y)), where n_samples
#     is the number of samples, n_classes the number of classes and
#     np.bincount(y) the number of samples per class, so the more samples a
#     class has, the smaller its weight.  If per-sample weights are given
#     as well, each one is multiplied by the class_weight of its class to
#     obtain the final weight of the sample.
# max_depth: maximum depth of the tree (a form of pruning), None by
#     default.  It is usually limited to prevent overfitting, typically to
#     5-20 depending on the distribution of the samples.
# splitter: node-splitting strategy, 'best' by default; 'random' picks the
#     best random split and is generally used on large data sets to reduce
#     the amount of computation.
# min_samples_leaf: minimum number of samples in a leaf, 1 by default.  A
#     split is only considered when both resulting branches keep at least
#     this many samples; a leaf with fewer samples is pruned together with
#     its sibling.  With large data sets, increasing the value ends the
#     growth of the tree earlier.
# random_state: seed of the random number generator, used when splitter is
#     'random'.
# min_samples_split: minimum number of samples a node must hold before it
#     may be split, 2 by default.
# max_features: maximum number of features examined when splitting a node,
#     None by default; 'auto' searches at most sqrt(n) features, 'log2' at
#     most log2(n) features, and an integer may also be given.
# min_impurity_decrease: a node is only split when the split reduces the
#     impurity (as measured by `criterion`) by at least this value; 0 by
#     default.  Setting it ends the growth of the tree earlier.
# min_impurity_split: a node is only split when its impurity is at least
#     this value; 1e-7 by default.  It is dropped after version 0.25 and
#     replaced by min_impurity_decrease.

# Train the tree on the training split.
model.fit(X_train, Y_train)
# Accuracy of the fitted tree on the test and on the training split.
test_labels = model.predict(X_test)
train_labels = model.predict(X_train)
print('測試集上的準確率為%s'%accuracy_score(Y_test, test_labels))
print('訓練集上的準確率為%s'%accuracy_score(Y_train, train_labels))
# Output recorded with the original run (it was pasted into the code):
#   test-set accuracy:     0.6101694915254238
#   training-set accuracy: 0.6014558689717925
# Importance of every feature according to the fitted tree.
# `feature_cols` was not defined anywhere in the visible code; the model
# was fitted on X_train, so its columns are the matching feature names.
feature_cols = list(X_train.columns)
feat_imp_dict = dict(zip(feature_cols, model.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.rename(columns={0: 'FeatureImportance'}, inplace=True)
feat_imp.sort_values(by=['FeatureImportance'], ascending=False).head()
# Output recorded with the original run (it was pasted into the code):
#                         FeatureImportance
#   alcohol                        0.507726
#   sulphates                      0.280996
#   total sulfur dioxide           0.190009
#   volatile acidity               0.021269
#   fixed acidity                  0.000000
# Cost-complexity pruning path: the effective alphas and the total leaf
# impurity that corresponds to each of them.
path = model.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# plt.subplots() returns the (figure, axes) pair; plt.figure() returns
# only a Figure, so the original unpacking failed.
fig, ax = plt.subplots(figsize=(16, 8))
# The last alpha is left out of the plot.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle='steps-post')
ax.set_xlabel('effective alpha')
ax.set_ylabel('total impurity of leaves')
# Fit and keep one tree per alpha.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, Y_train)
    clfs.append(clf)
# Drop the last entry: that tree consists of a single node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Total node count and depth of the tree as a function of alpha.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
# plt.subplots(2, 1) returns the figure and an array of two axes;
# plt.subplot(2, 1) is not a valid call and cannot be unpacked this way.
fig, ax = plt.subplots(2, 1)
ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle='steps-post')
ax[0].set_xlabel('alpha')
ax[0].set_ylabel('number of nodes')
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker='o', drawstyle='steps-post')
ax[1].set_xlabel('alpha')
ax[1].set_ylabel('depth of Tree')
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
# Training and test accuracy of every pruned tree as a function of alpha.
train_scores = []
test_scores = []
for clf in clfs:
    train_scores.append(clf.score(X_train, Y_train))
    test_scores.append(clf.score(X_test, Y_test))
fig, ax = plt.subplots()
for curve_name, curve in (('train', train_scores), ('test', test_scores)):
    ax.plot(ccp_alphas, curve, marker='o', label=curve_name, drawstyle='steps-post')
ax.set_xlabel('alpha')
ax.set_ylabel('accuracy')
ax.legend()
plt.show()
# One row per alpha: depth, node count, alpha and both accuracy scores.
i = np.arange(len(ccp_alphas))
summary_columns = {
    'Depth': depth,
    'Node': node_counts,
    'ccp': ccp_alphas,
    'train_scores': train_scores,
    'test_scores': test_scores,
}
ccp = pd.DataFrame(
    {name: pd.Series(values, index=i) for name, values in summary_columns.items()}
)
ccp.tail()
# Rows that reach the highest test accuracy.
top_test_score = ccp['test_scores'].max()
best = ccp[ccp['test_scores'] == top_test_score]