機器學習之 基於xgboost的特徵篩選

語亦情非發表於2020-03-19

本文主要是基於xgboost進行特徵選擇,很多人都知道在後面的模型選擇時,xgboost模型是一個非常熱門的模型。但其實在前面特徵選擇部分,基於xgboost進行特徵篩選也大有可為。

#coding=utf-8

import pandas as pd
import xgboost as xgb
import os,random,pickle


# Directory that receives one feature-score CSV per trained model.
# os.mkdir() raises when the directory already exists, so only create it
# when it is missing; this keeps the script re-runnable.
if not os.path.isdir('featurescore'):
    os.mkdir('featurescore')


# Training features joined with their labels on the 'Idx' key.
# train_master.csv is decoded as GB18030; only 'Idx' and 'target' are kept.
train = pd.read_csv('../../data/train/train_x_rank.csv')
train_target = pd.read_csv('../../data/train/train_master.csv',encoding='gb18030')[['Idx','target']]
train = pd.merge(train,train_target,on='Idx')
train_y = train.target
train_x = train.drop(['Idx','target'],axis=1)
dtrain = xgb.DMatrix(train_x, label=train_y)

# Test features; the 'Idx' column is set aside and removed before building
# the DMatrix.
test = pd.read_csv('../../data/test/test_x_rank.csv')
test_Idx = test.Idx
test = test.drop('Idx',axis=1)
dtest = xgb.DMatrix(test)


# Stack train and test rows and dump them to one CSV.
# NOTE(review): at this point `train` still carries 'Idx' and 'target' while
# `test` has neither, so those two columns are NaN for the test rows -
# confirm this is intended.
train_test = pd.concat([train,test])
train_test.to_csv('rank_feature.csv',index=False)
# The original line read `print print(...)`, which is a syntax error.
print(train_test.shape)

# Reference only: the block below is a bare string literal, so nothing inside
# it is executed. It records one fixed parameter set and an xgb.cv call
# (1100 rounds, 10 folds). The trailing '#733' is the author's own annotation;
# presumably the best round found - unconfirmed.
"""
params={
    	'booster':'gbtree',
    	'objective': 'rank:pairwise',
    	'scale_pos_weight': float(len(train_y)-sum(train_y))/float(sum(train_y)),
        'eval_metric': 'auc',
    	'gamma':0.1,
    	'max_depth':6,
    	'lambda':500,
        'subsample':0.6,
        'colsample_bytree':0.3,
        'min_child_weight':0.2, 
        'eta': 0.04,
    	'seed':1024,
    	'nthread':8
        }
xgb.cv(params,dtrain,num_boost_round=1100,nfold=10,metrics='auc',show_progress=3,seed=1024)#733

"""

def pipeline(iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    """Train one xgboost model and write its feature scores to a CSV file.

    The model is trained on the module-level ``dtrain`` for 700 boosting
    rounds with the 'rank:pairwise' objective, AUC as the evaluation metric
    and a fixed eta of 0.2. The scores returned by ``model.get_fscore()``
    are sorted from highest to lowest and saved to
    ``./featurescore/feature_score_<iteration>.csv`` under a
    ``feature,score`` header.

    Args:
        iteration: Run index; only used in the output file name.
        random_seed: Passed to xgboost as 'seed'.
        gamma: Passed to xgboost as 'gamma'.
        max_depth: Passed to xgboost as 'max_depth'.
        lambd: Passed to xgboost as 'lambda'.
        subsample: Passed to xgboost as 'subsample'.
        colsample_bytree: Passed to xgboost as 'colsample_bytree'.
        min_child_weight: Passed to xgboost as 'min_child_weight'.

    Returns:
        None. The only output is the CSV file.
    """
    # (len - sum) / sum over the module-level labels; this is the
    # negative-to-positive ratio if `target` is 0/1 - assumption, confirm
    # against the data.
    label_sum = sum(train_y)
    class_ratio = float(len(train_y) - label_sum) / float(label_sum)

    params = {
        'booster': 'gbtree',
        'objective': 'rank:pairwise',
        'scale_pos_weight': class_ratio,
        'eval_metric': 'auc',
        'gamma': gamma,
        'max_depth': max_depth,
        'lambda': lambd,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'eta': 0.2,
        'seed': random_seed,
        'nthread': 8,
    }

    # Progress is evaluated on the training matrix itself.
    model = xgb.train(params, dtrain, num_boost_round=700,
                      evals=[(dtrain, 'train')])

    # Disabled in the original: saving the model and scoring the test set.
    #model.save_model('./model/xgb{0}.model'.format(iteration))
    #test_y = model.predict(dtest)
    #test_result = pd.DataFrame(test_Idx,columns=["Idx"])
    #test_result["score"] = test_y
    #test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=None,encoding='utf-8')

    # One "name,score" line per feature, highest score first.
    ranked = sorted(model.get_fscore().items(),
                    key=lambda pair: pair[1], reverse=True)
    rows = ["{0},{1}\n".format(name, score) for (name, score) in ranked]

    out_path = './featurescore/feature_score_{0}.csv'.format(iteration)
    with open(out_path, 'w') as out:
        out.write("feature,score\n")
        out.writelines(rows)


if __name__ == "__main__":
    # Number of models to train. Every per-run candidate list below holds
    # at least this many values, so indexing by run number is safe.
    n_runs = 36

    # Candidate values for each hyper-parameter. The integer ranges are
    # materialised with list() because random.shuffle() reorders in place
    # and cannot operate on a Python 3 range object.
    random_seed = list(range(10000,20000,100))
    gamma = [i/1000.0 for i in range(0,300,3)]
    max_depth = [5,6,7]
    lambd = list(range(400,600,2))
    subsample = [i/1000.0 for i in range(500,700,2)]
    colsample_bytree = [i/1000.0 for i in range(550,750,4)]
    min_child_weight = [i/1000.0 for i in range(250,550,3)]

    # Shuffle each list independently so that run i gets a random
    # combination of values. The random module is not seeded here, so the
    # combinations differ between executions.
    for candidates in (random_seed, gamma, max_depth, lambd, subsample,
                       colsample_bytree, min_child_weight):
        random.shuffle(candidates)

    # Record the shuffled lists so the combination used by each run can be
    # looked up later. pickle writes bytes, hence binary mode.
    with open('params.pkl','wb') as f:
        pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)

    for i in range(n_runs):
        # max_depth has only three candidates, so cycle through them.
        pipeline(i,random_seed[i],gamma[i],max_depth[i%len(max_depth)],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])

 因為xgboost的引數選擇非常重要,因此進行了引數shuffle的操作。最後可以基於以上不同引數組合的xgboost所得到的feature和score,再進行score平均操作,篩選出高得分的特徵。

 

import pandas as pd 
import os


# Sum each feature's score over every per-run CSV found in featurescore/.
files = os.listdir('featurescore')
fs = {}
for f in files:
    t = pd.read_csv('featurescore/'+f)
    # Map feature name -> score for this one file.
    d = t.set_index('feature')['score'].to_dict()
    for key in d:
        # dict.has_key() exists only on Python 2; dict.get() with a default
        # of 0 gives the same running total on both Python 2 and 3.
        fs[key] = fs.get(key, 0) + d[key]

# Highest total first. From here on `fs` is a list of (feature, total) pairs.
fs = sorted(fs.items(), key=lambda x:x[1],reverse=True)

t = ["{0},{1}\n".format(key,value) for (key,value) in fs]

# The totals are written as-is; no averaging is done here.
with open('rank_feature_score.csv','w') as f:
    f.write("feature,score\n")
    f.writelines(t)

 這裡得出的是每個特徵在36組引數下的總分,每個總分除以36就是平均分了。最後按照平均分排序,取出top N個特徵就可以。我的理解是這樣子。

然後覺得這種方法太耗時了。

相關文章