Model Tuning

Posted by mambakb on 2018-12-24

K-Fold Cross-Validation

Step 1: Randomly split the original data into k parts, sampling without replacement.
Step 2: In each round, pick 1 of the k parts as the test set and use the remaining k-1 parts as the training set.
Step 3: Repeat step 2 k times, so that every subset serves exactly once as the test set and the other k-1 times as part of the training set. Train a model on each training set, evaluate it on the corresponding test set, and record the evaluation metric.
Step 4: Average the k test results as the estimate of model accuracy; this average is the model's performance indicator under k-fold cross-validation.

Here we use 5-fold cross-validation; a minimal sketch of the procedure is shown below.
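As an illustration only (this snippet is not part of the original script), scikit-learn's cross_val_score runs the whole k-fold procedure in one call. It assumes the X_train/y_train arrays prepared in the script further below, with LogisticRegression standing in as a generic classifier:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Each of the 5 folds is held out exactly once; the mean of the
# 5 AUC scores estimates the model's generalization performance.
clf = LogisticRegression(random_state=2018)
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
print(scores, scores.mean())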

Grid Search

GridSearchCV exists to automate hyperparameter tuning: pass in the candidate parameters and it returns the best score together with the parameters that achieved it. The method is only practical for small datasets, though; once the data volume grows, exhaustively evaluating every parameter combination becomes too slow to produce results.
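For larger datasets, a common alternative (not used in this post; shown here only as a sketch) is RandomizedSearchCV, which evaluates a fixed number of randomly sampled parameter combinations instead of the full grid:

from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Sample 20 parameter settings instead of exhausting the grid;
# 'C' is drawn from a continuous uniform distribution over [0.001, 100.001).
rs = RandomizedSearchCV(LogisticRegression(random_state=2018),
                        param_distributions={'C': uniform(0.001, 100)},
                        n_iter=20, scoring='roc_auc', cv=5, random_state=2018)
# rs is then fitted and inspected exactly like the GridSearchCV objects below.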

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

data_all = pd.read_csv(r'D:\data_all.csv', encoding='gbk')

X = data_all.drop(['status'], axis=1)
y = data_all['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
# Standardize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#LR
# penalty='l1' requires a solver that supports it, e.g. liblinear
lr = LogisticRegression(random_state=2018, solver='liblinear')
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3], 'penalty': ['l1', 'l2']}
grid = GridSearchCV(estimator=lr, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#DecisionTree
dt = DecisionTreeClassifier(random_state=2018)
param = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
         'max_depth': [2, 4, 6, 8], 'max_features': ['sqrt', 'log2', None]}
grid = GridSearchCV(estimator=dt, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#SVM
svc = svm.SVC(random_state=2018)
param = {'C': [1e-2, 1e-1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid = GridSearchCV(estimator=svc, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#RandomForest
rft = RandomForestClassifier(random_state=2018)
param = {'n_estimators': [10, 20, 50, 100], 'criterion': ['gini', 'entropy'],
         'max_depth': [2, 4, 6, 8, 10, None], 'max_features': ['sqrt', 'log2', None]}
grid = GridSearchCV(estimator=rft, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#GBDT
gb = GradientBoostingClassifier(random_state=2018)
# note: this grid has 432 combinations, so the search can take a while
param = {'max_features': ['sqrt', 'log2', None], 'learning_rate': [0.01, 0.1, 0.5, 1],
         'n_estimators': range(20, 200, 20), 'subsample': [0.2, 0.5, 0.7, 1.0]}
grid = GridSearchCV(estimator=gb, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#XGBoost
xgb_c = XGBClassifier()
param = {'n_estimators': range(20, 200, 20), 'max_depth': [2, 6, 10], 'reg_lambda': [0.2, 0.5, 1]}
grid = GridSearchCV(estimator=xgb_c, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
#LightGBM
lgbm_c = LGBMClassifier()
param = {'learning_rate': [0.2, 0.5, 0.7], 'max_depth': range(1, 10, 2), 'n_estimators': range(20, 100, 10)}
grid = GridSearchCV(estimator=lgbm_c, param_grid=param, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))
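As a short follow-up sketch (not part of the original script), the best model found by any of the searches above can be pulled out and reused directly, since GridSearchCV refits the winning parameters on the whole training set by default:

best_model = grid.best_estimator_   # the refit model with the best parameters
y_pred = best_model.predict(X_test)
# Evaluate with the same metric that guided the tuning.
print(roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]))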
