A Round-Up of Handy Little Functions for Data Analysis [continuously updated; personal notes...]

Posted by alicelmx on 2019-02-25

Printing the proportion of missing values

import numpy as np
import pandas as pd

def missing_data(data):
    """Return each column's missing count, missing percentage, and dtype."""
    total = data.isnull().sum()
    percent = data.isnull().sum() / data.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    types = []
    for col in data.columns:
        types.append(str(data[col].dtype))
    tt['Types'] = types
    return np.transpose(tt)

Filling missing values

Mode: mode() | Mean: mean() | Median: median()

# Fill missing product_type values with the column's mode; fillna() avoids
# the chained-assignment pitfalls of the boolean-indexing approach.
train['product_type'] = train['product_type'].fillna(train['product_type'].mode()[0])
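Mean and median fills follow the same pattern for numeric columns; a minimal sketch (the column names here are hypothetical):

# Hypothetical numeric columns, filled with the mean and the median respectively.
train['price'] = train['price'].fillna(train['price'].mean())
train['area'] = train['area'].fillna(train['area'].median())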

Plotting correlation coefficients between features

import matplotlib.pyplot as plt
import seaborn as sns

# Compute the pairwise correlation coefficients of all features and plot them.
corrmat = train.corr()  # correlation matrix
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)  # heatmap

# Take the ten features most correlated with SalePrice and plot their heatmap.
k = 10 # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

Converting timestamps to dates

import time

time_format = '%Y-%m-%d %H:%M:%S'

def get_date(timestamp):
    """Convert a Unix timestamp (seconds) into a local-time string."""
    time_local = time.localtime(timestamp)
    return time.strftime(time_format, time_local)

train_df['action_date'] = train_df['timestamp_of_behavior'].apply(get_date)
train_df['action_date'] = pd.to_datetime(train_df['action_date'])

train_df['action_day'] = train_df['action_date'].dt.day
train_df['action_hour'] = train_df['action_date'].dt.hour
train_df['action_minute'] = train_df['action_date'].dt.minute
train_df['action_week'] = train_df['action_date'].dt.weekday
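For what it's worth, the same conversion can be done vectorized, skipping the Python-level apply. A sketch, assuming the timestamps are Unix seconds; note that pd.to_datetime with unit='s' yields timezone-naive UTC datetimes, whereas time.localtime applies the local timezone:

# Vectorized equivalent, assuming Unix timestamps in seconds (UTC-naive result).
train_df['action_date'] = pd.to_datetime(train_df['timestamp_of_behavior'], unit='s')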

Encoding categorical attributes as numbers

from sklearn import preprocessing

def encode_count(df, column_name):
    """Label-encode a single categorical column in place."""
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df[column_name].values))
    df[column_name] = lbl.transform(list(df[column_name].values))
    return df

train_df = encode_count(train_df, 'gender')
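Applying it to several categorical columns is just a loop (the column names besides gender are hypothetical):

for col in ['gender', 'city', 'device_type']:  # hypothetical column list
    train_df = encode_count(train_df, col)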

Sorting by two features

# Sort the data by uid and timestamp_of_behavior in ascending order.
train_df.sort_values(['uid','timestamp_of_behavior'],ascending=True,inplace=True)
train_df.reset_index(drop=True,inplace=True)

Group-aggregating df_1, then merging the result onto df_2 with a renamed column

def merge_mean(df_1, df_2, columns, value, cname):
    """Group df_1 by `columns`, average `value`, and left-join the result onto df_2 as `cname`."""
    add = pd.DataFrame(df_1[df_1[value].notnull()].groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df_2 = df_2.merge(add, on=columns, how="left")
    return df_2

final_data_df = merge_mean(train_df,final_data_df,['uid'],'action_time_delta','action_time_delta_mean')

Group-aggregating a df, then merging the result back onto itself with a renamed column

def mean_self_merge(df, columns, value, cname):
    add = pd.DataFrame(df.groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df = df.merge(add,on=columns,how="left")
    
    return df
train_df = mean_self_merge(train_df,['vid'],'video_play_per','one_video_play_per')
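The self-merge variant can also be written with groupby().transform, which skips the intermediate merge entirely and is behaviorally equivalent:

# Equivalent one-liner using transform instead of merge.
train_df['one_video_play_per'] = train_df.groupby('vid')['video_play_per'].transform('mean')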

Row-by-row differences of behavior timestamps

# Per-uid first-order difference of consecutive behavior timestamps.
train_df['action_time_diff'] = train_df.groupby('uid')['timestamp_of_behavior'].diff()
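The first row of each uid group has no predecessor, so its diff is NaN; a tiny demo:

demo = pd.DataFrame({'uid': [1, 1, 2], 'timestamp_of_behavior': [100, 160, 300]})
demo['action_time_diff'] = demo.groupby('uid')['timestamp_of_behavior'].diff()
# uid 1 -> NaN, 60.0; uid 2 -> NaN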

A CV class for training various models

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

seed = 2019  # assumed value for the global random seed

class CVClassifier():
    def __init__(self, estimator, n_splits=5, stratified=True, num_round=77777, **params):
        self.n_splits_ = n_splits
        self.scores_ = []
        self.clf_list_ = []
        self.estimator_ = estimator
        self.stratified_ = stratified
        self.num_round_ = num_round
        self.params_ = params  # always set, so later references work even with no params
        
    def cv(self, train_X, train_y):
        if self.stratified_:
            folds = StratifiedKFold(self.n_splits_, shuffle=True, random_state=seed)
        else:
            folds = KFold(self.n_splits_, shuffle=True, random_state=seed)
        oof = np.zeros(len(train_y))
        for fold, (train_idx, val_idx) in enumerate(folds.split(train_X, train_y)):
            print('fold %d' % fold)
            trn_data, trn_y = train_X.iloc[train_idx], train_y[train_idx]
            val_data, val_y = train_X.iloc[val_idx], train_y[val_idx]
            if self.estimator_ == 'lgbm':
                train_set = lgb.Dataset(data=trn_data, label=trn_y)
                val_set = lgb.Dataset(data=val_data, label=val_y)
                clf = lgb.train(params=self.params_, train_set=train_set, num_boost_round=self.num_round_,
                                valid_sets=[train_set, val_set], verbose_eval=100, early_stopping_rounds=200)
                oof[val_idx] = clf.predict(val_data, num_iteration=clf.best_iteration)
                
            elif self.estimator_ == 'xgb':
                train_set = xgb.DMatrix(data=trn_data, label=trn_y)
                val_set = xgb.DMatrix(data=val_data, label=val_y)
                watchlist = [(train_set, 'train'), (val_set, 'valid')]
                clf = xgb.train(self.params_, train_set, self.num_round_, watchlist, 
                               early_stopping_rounds=200, verbose_eval=100)
                oof[val_idx] = clf.predict(val_set, ntree_limit=clf.best_ntree_limit)
            
            elif self.estimator_ == 'cat':
                clf = CatBoostClassifier(self.num_round_, task_type='GPU', early_stopping_rounds=500, **self.params_)
                clf.fit(trn_data, trn_y, eval_set=(val_data, val_y), cat_features=[], use_best_model=True, verbose=500)
                oof[val_idx] = clf.predict_proba(val_data)[:, 1]

            # scikit-learn estimator: clone it so each fold trains a fresh copy
            else:
                clf = clone(self.estimator_).fit(trn_data, trn_y)
                try:
                    oof[val_idx] = clf.predict_proba(val_data)[:, 1]
                except AttributeError:
                    oof[val_idx] = clf.decision_function(val_data)
            
            self.clf_list_.append(clf)
            fold_score = roc_auc_score(train_y[val_idx], oof[val_idx])
            self.scores_.append(fold_score)
            print('Fold score: {:<8.5f}'.format(fold_score))
        self.oof_ = oof
        self.score_ = roc_auc_score(train_y, oof)
        print("CV score: {:<8.5f}".format(self.score_))
        
    def predict(self, test_X):
        self.predictions_ = np.zeros(len(test_X))
        
        if self.estimator_ == 'lgbm':
            self.feature_importance_df_ = pd.DataFrame()
            for fold, clf in enumerate(self.clf_list_):
                fold_importance_df = pd.DataFrame()
                fold_importance_df["feature"] = features
                fold_importance_df["importance"] = clf.feature_importance()
                fold_importance_df["fold"] = fold + 1
                self.feature_importance_df_ = pd.concat([self.feature_importance_df_, fold_importance_df], axis=0)
                
                self.predictions_ += clf.predict(test_X, num_iteration=clf.best_iteration) * (self.scores_[fold] / sum(self.scores_))
        elif self.estimator_ == 'xgb':
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit) \
                * (self.scores_[fold] / sum(self.scores_))
        elif self.estimator_ == 'cat':
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict_proba(test_X)[:, 1] * (self.scores_[fold] / sum(self.scores_))
        else:
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict_proba(test_X)[:, 1] * (self.scores_[fold] / sum(self.scores_))

# Class for Bayesian Optimisation
class CVForBO():
    def __init__(self, model, train_X, train_y, test_X, base_params, int_params=[], n_splits=5, num_round=77777):
        self.oofs_ = []
        self.params_ = []
        self.predictions_ = []
        self.cv_scores_ = []
        self.model_ = model
        self.train_X_ = train_X
        self.train_y_ = train_y
        self.test_X_ = test_X
        self.base_params_ = base_params
        self.int_params_ = int_params
        self.n_splits_ = n_splits
        self.num_round_ = num_round
        
    def cv(self, **opt_params):
        for p in self.int_params_:
            if p in opt_params:
                opt_params[p] = int(np.round(opt_params[p]))
        self.base_params_.update(opt_params)
        
        cv_model = CVClassifier(self.model_, n_splits=self.n_splits_, num_round=self.num_round_, **self.base_params_)
        cv_model.cv(self.train_X_, self.train_y_)
        cv_model.predict(self.test_X_)
        
        self.oofs_.append(cv_model.oof_)
        self.predictions_.append(cv_model.predictions_)
        self.params_.append(self.base_params_.copy())  # copy: base_params_ is mutated on every call
        self.cv_scores_.append(cv_model.score_)

        return cv_model.score_
    
    def post_process(self, model_type=None, oof_path='inter_oofs.csv', pred_path='inter_preds.csv', params_path='inter_params.csv'):
        if not model_type:
            model_type=self.model_
        cols = ['{}_{}_{}'.format(model_type, str(self.cv_scores_[k]).split('.')[-1][:5], k) for k in range(len(self.cv_scores_))]
        self.oof_df = pd.DataFrame(np.array(self.oofs_).T, columns=cols)
        self.pred_df = pd.DataFrame(np.array(self.predictions_).T, columns=cols)
        self.params_df = pd.DataFrame(self.params_).T.rename(columns={c_old: c_new for c_old, c_new in enumerate(cols)})
        
        self.oof_df.to_csv(oof_path)
        self.pred_df.to_csv(pred_path)
        self.params_df.to_csv(params_path)
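A sketch of how CVForBO could be wired into the bayes_opt package; the search space, base params, and data variables here are hypothetical:

from bayes_opt import BayesianOptimization

# Hypothetical setup for tuning an 'lgbm' CVClassifier.
cv_bo = CVForBO('lgbm', train_X, train_y, test_X,
                base_params={'objective': 'binary', 'metric': 'auc'},
                int_params=['num_leaves'])
optimizer = BayesianOptimization(f=cv_bo.cv,
                                 pbounds={'num_leaves': (31, 255),
                                          'feature_fraction': (0.5, 1.0)},
                                 random_state=seed)
optimizer.maximize(init_points=3, n_iter=20)  # cv() returns the CV AUC to maximize
cv_bo.post_process(model_type='lgbm')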

Training a LightGBM model and saving feature importances

from sklearn.model_selection import train_test_split

def train_model(final_data_df, model_path):
    # Collect all feature columns (uid is an identifier, not a feature).
    print('Getting all the feature...')
    lgb_feature_list = list(final_data_df.columns.drop(['uid']))
    lgb_df = final_data_df[lgb_feature_list].copy()
    target = 'whether_to_keep'

    # Split the data into training and validation sets.
    print('Dividing dataset to trainset and valset...')
    train, val = train_test_split(lgb_df, test_size=0.2, random_state=2018)

    train_X = train.drop(target, axis=1)
    train_y = train[target]

    val_X = val.drop(target, axis=1)
    val_y = val[target]

    # Drop the reference promptly to free memory.
    del final_data_df

    # list.remove() mutates in place and returns None, so don't assign its result.
    lgb_feature_list.remove(target)
    feature_name = lgb_feature_list

    lgb_train = lgb.Dataset(train_X, train_y, feature_name=feature_name)
    lgb_eval = lgb.Dataset(val_X, val_y, feature_name=feature_name, reference=lgb_train)

    # Saving the Datasets as LightGBM binary files makes reloading faster:
    print('Saving trainset and valset...')
    lgb_train.save_binary('./train.bin')
    lgb_eval.save_binary('./val.bin')

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_thread': -1,
        'num_leaves':70,
        'max_depth':7,
        'learning_rate':0.01,
        'bagging_freq': 4,
        'bagging_fraction': 0.6,
        'feature_fraction': 0.6,
        'lambda_l1':1,
        'lambda_l2':1,
        'num_boost_round':20000,
        'data_random_seed':2017
    }

    ## Train
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        early_stopping_rounds=100
    )
    ### Save the model
    model.save_model(model_path, num_iteration=model.best_iteration)

    # Save the feature importances
    importance = model.feature_importance()
    names = model.feature_name()
    with open('./feature_importance.txt', 'w+') as file:
        for index, im in enumerate(importance):
            string = names[index] + ', ' + str(im) + '\n'
            file.write(string)
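Reloading the saved model for inference is then a one-liner (test_X is a hypothetical feature frame with the same columns):

booster = lgb.Booster(model_file=model_path)  # load the saved model
preds = booster.predict(test_X)               # probabilities under the binary objective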

Bucket analysis

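The illustration from the original post is missing here. As a stand-in, a minimal sketch of the usual pandas bucketing pattern (pd.cut plus groupby); the age column and bin edges are hypothetical, and whether_to_keep is the target used above:

# Bin a continuous feature into buckets, then aggregate the target per bucket.
final_data_df['age_bucket'] = pd.cut(final_data_df['age'], bins=[0, 18, 30, 45, 60, 100])
bucket_stats = final_data_df.groupby('age_bucket')['whether_to_keep'].agg(['mean', 'count'])
print(bucket_stats)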
