分別利用xgboost和catboost演算法做特徵重要性排序

Einsteintly發表於2020-11-13

分別利用xgboost和catboost演算法做特徵重要性排序

xgboost

# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
# Suppress every Python warning for the rest of the script.
warnings.filterwarnings('ignore')
# Use a font that can render the Chinese axis/tick labels used below.
plt.rcParams['font.family']='Microsoft YaHei'
# Apply the "ggplot" plotting style to all figures.
plt.style.use ('ggplot')
import seaborn as sns


# Load the raw export, replace missing values with 0, discard the
# `distinct_id` column, and swap the raw column names for Chinese
# display labels.
df = (
    pd.read_csv('***.csv')
    .fillna(0)
    .drop(columns=['distinct_id'])
    .rename(columns={
        'is_stay': '是否留存',
        'max(is_login)': '是否登入',
        'start_numbers': '啟動次數',
        'is_play': '是否播放視訊',
        'is_action': '是否評論/收藏/點贊/分享',
        'show_numbers': '視訊曝光數',
        'video_play_numbers': '視訊播放數',
        'real_video_play_numbers': '視訊有效播放數',
        'video_play_time': '視訊播放總時長',
        'video_play_rate': '視訊播放率',
        'video_real_play_rate': '視訊有效播放率',
        'play_time_per_play': '每播放播放時長',
        'play_time_per_show': '每曝光播放時長',
    })
)

# '是否留存' is the prediction target; every remaining column is a feature.
y = df['是否留存']
X = df.drop(columns=['是否留存'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


# Binary-logistic gradient-boosted tree classifier.
# - `verbosity=0` replaces the former `silent=True`: current XGBoost no
#   longer recognises `silent`, so the quiet mode it asked for was not
#   being applied.
# - The former `seed=0` duplicated `random_state=0`; only the
#   scikit-learn style `random_state` is kept.
xgb_model = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=7,
    min_child_weight=5,
    gamma=0.1,
    objective='binary:logistic',
    random_state=0,
    verbosity=0,
)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows; `cm` is the confusion matrix of true
# versus predicted labels.
y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the confusion matrix `cm` as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 10))

cmap = sns.cubehelix_palette(start=1.5, rot=3, gamma=0.8, as_cmap=True)
sns.heatmap(
    cm,
    annot=True,
    fmt='g',  # plain integer counts instead of scientific notation
    cmap=cmap,
    linewidths=1.5,
    annot_kws={'size': 20, 'weight': 'bold', 'color': 'red'},
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.setp(ax.get_yticklabels(), rotation=0)  # keep the y tick labels horizontal
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# confusion_matrix orders classes ascending, so row/column 0 is target
# value 0. The target is '是否留存' (retained): 0 = not retained = churned
# ('流失'), 1 = retained = not churned ('未流失'). The labels were previously
# listed in the opposite order, which mislabelled both axes.
# NOTE(review): assumes the target is encoded 0/1 with 1 = retained - confirm.
class_names = ['流失', '未流失']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
plt.show()

# Plot the fitted model's feature importances, using the 'gain'
# importance type, on a dedicated wide figure.
fig, ax = plt.subplots(figsize=(16, 9))
xgb.plot_importance(
    xgb_model,
    ax=ax,
    height=0.5,
    importance_type='gain',
    xlabel='xgbboost演算法計算出的留存相關特徵重要性',
    grid=False,
)

# Build a Graphviz rendering of one tree of the ensemble (num_trees=1).
# The returned object is not assigned; presumably this relies on a
# notebook displaying the last expression of the cell - confirm.
xgb.to_graphviz(
    xgb_model,
    num_trees=1,
    yes_color='#638e5e',
    no_color='#a40000',
)

結果展示:(此處原為結果圖片)

catboost

# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
# Suppress every Python warning for the rest of the script.
warnings.filterwarnings('ignore')
# Use a font that can render the Chinese axis/tick labels used below.
plt.rcParams['font.family']='Microsoft YaHei'
# Apply the "ggplot" plotting style to all figures.
plt.style.use ('ggplot')
import seaborn as sns
from catboost import CatBoostClassifier
import numpy as np

# Load the raw export, replace missing values with 0, discard the
# `distinct_id` column, and swap the raw column names for Chinese
# display labels.
df = (
    pd.read_csv('***.csv')
    .fillna(0)
    .drop(columns=['distinct_id'])
    .rename(columns={
        'is_stay': '是否留存',
        'max(is_login)': '是否登入',
        'start_numbers': '啟動次數',
        'is_play': '是否播放視訊',
        'is_action': '是否評論/收藏/點贊/分享',
        'show_numbers': '視訊曝光數',
        'video_play_numbers': '視訊播放數',
        'real_video_play_numbers': '視訊有效播放數',
        'video_play_time': '視訊播放總時長',
        'video_play_rate': '視訊播放率',
        'video_real_play_rate': '視訊有效播放率',
        'play_time_per_play': '每播放播放時長',
        'play_time_per_show': '每曝光播放時長',
    })
)

# '是否留存' is the prediction target; every remaining column is a feature.
y = df['是否留存']
X = df.drop(columns=['是否留存'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Positions of every feature column whose dtype is not float64; these are
# handed to CatBoost as categorical features.
# `np.float` (a plain alias of the builtin float) was deprecated in
# NumPy 1.20 and removed in 1.24, so the comparison now uses np.float64,
# which selects exactly the same columns.
# NOTE(review): integer count columns are therefore also treated as
# categorical - confirm that this is intended.
categorical_features_indices = np.where(X_train.dtypes != np.float64)[0]

cat_model = CatBoostClassifier(
    iterations=100,
    depth=7,
    cat_features=categorical_features_indices,
    learning_rate=0.01,
    loss_function='Logloss',
    logging_level='Verbose',
)
cat_model.fit(X_train, y_train, plot=True)

# Evaluate on the held-out rows; `cm` is the confusion matrix of true
# versus predicted labels.
y_pred = cat_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the confusion matrix `cm` as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 10))

cmap = sns.cubehelix_palette(start=1.5, rot=3, gamma=0.8, as_cmap=True)
sns.heatmap(
    cm,
    annot=True,
    fmt='g',  # plain integer counts instead of scientific notation
    cmap=cmap,
    linewidths=1.5,
    annot_kws={'size': 20, 'weight': 'bold', 'color': 'red'},
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.setp(ax.get_yticklabels(), rotation=0)  # keep the y tick labels horizontal
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# confusion_matrix orders classes ascending, so row/column 0 is target
# value 0. The target is '是否留存' (retained): 0 = not retained = churned
# ('流失'), 1 = retained = not churned ('未流失'). The labels were previously
# listed in the opposite order, which mislabelled both axes.
# NOTE(review): assumes the target is encoded 0/1 with 1 = retained - confirm.
class_names = ['流失', '未流失']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
plt.show()

# Horizontal bar chart of the fitted CatBoost model's feature
# importances, one bar per feature name.
plt.figure(figsize=(16, 9))
plt.title('catboost演算法計算出的與留存相關特徵重要性')

fea_name = cat_model.feature_names_
fea_ = cat_model.feature_importances_
plt.barh(fea_name, fea_, height=0.5)

結果展示
(此處原為結果圖片)
實際上,第二種方法(catboost)的結果更符合當前的業務邏輯;兩種方法差異最大的地方在於二分類特徵的重要性排序,具體原因還有待進一步思考。

相關文章