k-Fold Cross-Validation
Step 1: Randomly partition the original data into k folds by sampling without replacement, so that each sample appears in exactly one fold.
Step 2: Pick one of the k folds as the test set and use the remaining k-1 folds as the training set to train the model.
Step 3: Repeat Step 2 k times, so that every fold serves as the test set exactly once and contributes to the training set in the other k-1 rounds. Each round yields one model trained on its training set; that model is evaluated on the corresponding test set, and the resulting evaluation metric is computed and stored.
Step 4: Average the k test results to obtain an estimate of the model's accuracy; this average is the model's performance metric under k-fold cross-validation.
Here we use 5-fold cross-validation, as the sketch below illustrates.
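As a minimal sketch of the four steps above (the synthetic dataset and the LogisticRegression model here are illustrative assumptions, not part of this post's data), 5-fold cross-validation can be written with scikit-learn's KFold:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Hypothetical dataset standing in for the real one
X, y = make_classification(n_samples=500, random_state=2018)

# Step 1: partition the data into k = 5 non-overlapping folds
kf = KFold(n_splits=5, shuffle=True, random_state=2018)

scores = []
for train_idx, test_idx in kf.split(X):
    # Steps 2-3: one fold is the test set, the remaining k-1 folds train the model
    model = LogisticRegression().fit(X[train_idx], y[train_idx])
    y_score = model.predict_proba(X[test_idx])[:, 1]
    scores.append(roc_auc_score(y[test_idx], y_score))  # store each fold's metric

# Step 4: average the k test results as the performance estimate
print(np.mean(scores))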
Grid Search
GridSearchCV exists to automate hyperparameter tuning: give it a parameter grid and it exhaustively evaluates every combination (each scored by cross-validation) and returns the best score and the parameters that achieved it. Because the search is exhaustive, the method is well suited to small datasets; once the data volume grows, it becomes very slow to produce results.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

# Load the data and split off a 30% hold-out test set
data_all = pd.read_csv(r'D:\data_all.csv', encoding='gbk')
X = data_all.drop(['status'], axis=1)
y = data_all['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

# Standardize the features (fit the scaler on the training set only)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# LR -- liblinear supports both the 'l1' and 'l2' penalties searched below
lr = LogisticRegression(random_state=2018, solver='liblinear')
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3],
         'penalty': ['l1', 'l2']}
grid = GridSearchCV(estimator=lr, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# DecisionTree
dt = DecisionTreeClassifier(random_state=2018)
param = {'criterion': ['gini', 'entropy'],
         'splitter': ['best', 'random'],
         'max_depth': [2, 4, 6, 8],
         'max_features': ['sqrt', 'log2', None]}
grid = GridSearchCV(estimator=dt, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# SVM
svc = svm.SVC(random_state=2018)
param = {'C': [1e-2, 1e-1, 1, 10],
         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid = GridSearchCV(estimator=svc, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# RandomForest
rft = RandomForestClassifier()
param = {'n_estimators': [10, 20, 50, 100],
         'criterion': ['gini', 'entropy'],
         'max_depth': [2, 4, 6, 8, 10, None],
         'max_features': ['sqrt', 'log2', None]}
grid = GridSearchCV(estimator=rft, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# GBDT
gb = GradientBoostingClassifier()
param = {'max_features': ['sqrt', 'log2', None],
         'learning_rate': [0.01, 0.1, 0.5, 1],
         'n_estimators': range(20, 200, 20),
         'subsample': [0.2, 0.5, 0.7, 1.0]}
grid = GridSearchCV(estimator=gb, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# XGBoost
xgb_c = XGBClassifier()
param = {'n_estimators': range(20, 200, 20),
         'max_depth': [2, 6, 10],
         'reg_lambda': [0.2, 0.5, 1]}
grid = GridSearchCV(estimator=xgb_c, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

# LightGBM
lgbm_c = LGBMClassifier()
param = {'learning_rate': [0.2, 0.5, 0.7],
         'max_depth': range(1, 10, 2),
         'n_estimators': range(20, 100, 10)}
grid = GridSearchCV(estimator=lgbm_c, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
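As a possible follow-up (a sketch, not part of the original script): several of the metric functions imported above, such as accuracy_score and f1_score, are never called. After any of the grid.fit calls, the refitted best model can be evaluated on the held-out test set like this, reusing grid, X_test, and y_test from the script above:

# Evaluate the tuned model on the hold-out set; GridSearchCV refits the
# best parameters on the full training set by default (refit=True)
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print(accuracy_score(y_test, y_pred))   # fraction of correct predictions
print(precision_score(y_test, y_pred))  # TP / (TP + FP)
print(recall_score(y_test, y_pred))     # TP / (TP + FN)
print(f1_score(y_test, y_pred))         # harmonic mean of precision and recall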