I. Preface
In the previous post we covered the basics of xgboost; in this one we work through concrete examples.
II. Details
1. Installation
By default you can install via pip. If that fails, download the matching wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/, copy it into the Scripts directory under your Anaconda3 installation, and run pip install on the package file.
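For example, with a hypothetical wheel filename (match it to your own Python version and OS): pip install xgboost-0.81-cp36-cp36m-win_amd64.whl. Afterwards, a quick sanity check from Python:

import xgboost
# if the import succeeds, the package is installed; print the version to confirm
print(xgboost.__version__)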
2. Code Examples
# First XGBoost model for Pima Indians dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]

# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Alternatively, we can add one tree at a time and watch how performance on a held-out set evolves, stopping early once it stops improving:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]

# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# fit model on training data, evaluating logloss on the test set after each tree;
# early_stopping_rounds=10 stops training once logloss fails to improve for 10 rounds
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
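Note that after early stopping, older xgboost versions keep every tree that was built, not just those up to the best round; on those versions you can restrict prediction to the best round via the wrapper's best_ntree_limit attribute. A hedged sketch (newer releases apply the best iteration automatically):

# predict using only the trees up to the round where validation logloss was best
y_pred = model.predict(X_test, ntree_limit=model.best_ntree_limit)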
To inspect how important each feature is to the model:
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into X and y
X = dataset[:, 0:8]
y = dataset[:, 8]

# fit model on the full dataset
model = XGBClassifier()
model.fit(X, y)

# plot feature importance
plot_importance(model)
pyplot.show()
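If you want the raw scores rather than a plot, the fitted wrapper also exposes them as the feature_importances_ attribute; a minimal sketch reusing the model fitted above:

# print each feature's importance score next to its column index
for i, score in enumerate(model.feature_importances_):
    print("feature %d: %.4f" % (i, score))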
Commonly used xgboost parameters (these keys belong to the native, non-sklearn interface; see the sketch after the list):
- 'booster': 'gbtree'
- 'objective': 'multi:softmax', for multi-class problems
- 'num_class': 10, number of classes, used together with multi:softmax
- 'gamma': minimum loss reduction required before a node is split further
- 'max_depth': 12, maximum depth of each tree; the deeper the tree, the more easily it overfits
- 'lambda': 2, L2 regularization term on the weights, controlling model complexity; the larger it is, the less prone the model is to overfitting
- 'subsample': 0.7, fraction of the training samples randomly sampled for each tree
- 'colsample_bytree': 0.7, fraction of columns sampled when constructing each tree
- 'min_child_weight': 3, minimum sum of instance weights in a child node; if a leaf node's weight sum falls below min_child_weight, the splitting process stops
- 'silent': 0, set to 1 to suppress runtime messages; it is best left at 0
- 'eta': 0.007, works like a learning rate
- 'seed': 1000, random seed
- 'nthread': 7, number of CPU threads
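A minimal sketch of passing such a dict to the native interface, assuming the same X_train/X_test/y_train/y_test split as in the earlier examples (num_boost_round and the parameter values are illustrative, and the binary objective replaces multi:softmax since this dataset has only two classes):

import xgboost as xgb

# collect native-interface parameters in a dict
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',  # binary task, so no num_class needed
    'gamma': 0.1,
    'max_depth': 12,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'silent': 0,
    'eta': 0.007,
    'seed': 1000,
    'nthread': 7,
}

# the native interface consumes DMatrix objects rather than numpy arrays
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
watchlist = [(dtrain, 'train'), (dtest, 'eval')]

# train for a fixed number of rounds, printing metrics on the watchlist
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
y_prob = bst.predict(dtest)  # predicted probabilities of the positive class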
With the sklearn wrapper, the equivalent settings are passed as constructor arguments instead, for example:

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
Cross-validation:
# Tune learning_rate
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]

# grid search
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, Y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
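The same pattern extends to searching several parameters at once; a sketch reusing model, kfold, X and Y from the listing above (the value grids are illustrative, and the cartesian product grows quickly, so keep them small):

# search learning_rate, n_estimators and max_depth jointly
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, Y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))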