A Collection of Handy Small Functions for Data Analysis [continuously updated, personal notes...]
Output the proportion of missing values per column

import numpy as np
import pandas as pd

def missing_data(data):
    # Number of missing values per column
    total = data.isnull().sum()
    # Percentage of missing values per column
    percent = data.isnull().sum() / data.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Record each column's dtype alongside the stats
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    tt['Types'] = types
    return np.transpose(tt)
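A quick usage sketch (assuming `train` is an already-loaded DataFrame, as in the sections below):

# Hypothetical usage: summarise missing values of the training set
print(missing_data(train))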
Missing value imputation

Mode: mode() | Mean: mean() | Median: median()

# Fill missing product_type values with the column's mode
# (plain column assignment avoids SettingWithCopyWarning from chained indexing)
train['product_type'] = train['product_type'].fillna(train['product_type'].mode()[0])
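Filling with the mean or median works the same way; a minimal sketch, assuming a numeric column named `price` (the column name is hypothetical):

# Hypothetical numeric column 'price'
train['price'] = train['price'].fillna(train['price'].mean())    # fill with the mean
train['price'] = train['price'].fillna(train['price'].median())  # fill with the median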
Plotting the correlation coefficients between features

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Compute the correlation coefficient between every pair of features and plot it.
corrmat = train.corr()  # correlation matrix
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)  # heatmap

# Take the ten features most correlated with the target and plot them as a heatmap.
k = 10  # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
Converting timestamps to dates

import time

time_format = '%Y-%m-%d %H:%M:%S'

def get_date(timestamp):
    # Convert a Unix timestamp to a local-time date string
    time_local = time.localtime(timestamp)
    dt = time.strftime(time_format, time_local)
    return dt

train_df['action_date'] = train_df['timestamp_of_behavior'].apply(get_date)
train_df['action_date'] = pd.to_datetime(train_df['action_date'])
train_df['action_day'] = train_df['action_date'].dt.day
train_df['action_hour'] = train_df['action_date'].dt.hour
train_df['action_minute'] = train_df['action_date'].dt.minute
train_df['action_week'] = train_df['action_date'].dt.weekday
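The round-trip through strings can usually be skipped: pandas converts Unix timestamps directly. A sketch, assuming the timestamps are in seconds (note that `unit='s'` yields UTC rather than local time, unlike `time.localtime`):

# Direct conversion, assuming seconds-resolution Unix timestamps
train_df['action_date'] = pd.to_datetime(train_df['timestamp_of_behavior'], unit='s')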
Encoding discrete attributes as integers

from sklearn import preprocessing

def encode_count(df, column_name):
    # Map each distinct category value to an integer label
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df[column_name].values))
    df[column_name] = lbl.transform(list(df[column_name].values))
    return df

train_df = encode_count(train_df, 'gender')
Sorting by two features

# Sort by uid and timestamp_of_behavior in ascending order
train_df.sort_values(['uid', 'timestamp_of_behavior'], ascending=True, inplace=True)
train_df.reset_index(drop=True, inplace=True)
Group-aggregate one DataFrame, left-join the result onto another, and rename the new column

def merge_mean(df_1, df_2, columns, value, cname):
    # Group df_1 by `columns`, take the mean of `value`, and attach it to df_2 as `cname`
    add = pd.DataFrame(df_1[df_1[value].notnull()].groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df_2 = df_2.merge(add, on=columns, how='left')
    return df_2

final_data_df = merge_mean(train_df, final_data_df, ['uid'], 'action_time_delta', 'action_time_delta_mean')
Group a DataFrame, then merge the aggregate back onto itself, renaming the new column

def mean_self_merge(df, columns, value, cname):
    add = pd.DataFrame(df.groupby(columns)[value].mean()).reset_index()
    add.columns = columns + [cname]
    df = df.merge(add, on=columns, how='left')
    return df

train_df = mean_self_merge(train_df, ['vid'], 'video_play_per', 'one_video_play_per')
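For the self-merge case, `groupby().transform` gives the same result without an explicit merge; a sketch using the same columns as above:

# Equivalent one-liner: broadcast the group mean back onto each row
train_df['one_video_play_per'] = train_df.groupby('vid')['video_play_per'].transform('mean')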
Row-wise differences of behaviour timestamps

# Within each uid group, take the difference between consecutive timestamps
train_df['action_time_diff'] = train_df.groupby('uid')['timestamp_of_behavior'].apply(lambda i: i.diff(1))
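The `apply(lambda ...)` detour is not required; groupby objects support `diff` directly, as in this equivalent sketch:

# Equivalent: per-uid consecutive differences
train_df['action_time_diff'] = train_df.groupby('uid')['timestamp_of_behavior'].diff()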
A CV wrapper class for training various models

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

seed = 2018  # random seed for fold splitting (a global `seed` was relied on but never defined; this value is assumed)

class CVClassifier():
    def __init__(self, estimator, n_splits=5, stratified=True, num_round=77777, **params):
        self.n_splits_ = n_splits
        self.scores_ = []
        self.clf_list_ = []
        self.estimator_ = estimator
        self.stratified_ = stratified
        self.num_round_ = num_round
        if params:
            self.params_ = params

    def cv(self, train_X, train_y):
        if self.stratified_:
            folds = StratifiedKFold(self.n_splits_, shuffle=True, random_state=seed)
        else:
            folds = KFold(self.n_splits_, shuffle=True, random_state=seed)
        oof = np.zeros(len(train_y))
        for fold, (train_idx, val_idx) in enumerate(folds.split(train_X, train_y)):
            print('fold %d' % fold)
            trn_data, trn_y = train_X.iloc[train_idx], train_y[train_idx]
            val_data, val_y = train_X.iloc[val_idx], train_y[val_idx]
            if self.estimator_ == 'lgbm':
                train_set = lgb.Dataset(data=trn_data, label=trn_y)
                val_set = lgb.Dataset(data=val_data, label=val_y)
                clf = lgb.train(params=self.params_, train_set=train_set, num_boost_round=self.num_round_,
                                valid_sets=[train_set, val_set], verbose_eval=100, early_stopping_rounds=200)
                oof[val_idx] = clf.predict(train_X.iloc[val_idx], num_iteration=clf.best_iteration)
            elif self.estimator_ == 'xgb':
                train_set = xgb.DMatrix(data=trn_data, label=trn_y)
                val_set = xgb.DMatrix(data=val_data, label=val_y)
                watchlist = [(train_set, 'train'), (val_set, 'valid')]
                clf = xgb.train(self.params_, train_set, self.num_round_, watchlist,
                                early_stopping_rounds=200, verbose_eval=100)
                oof[val_idx] = clf.predict(val_set, ntree_limit=clf.best_ntree_limit)
            elif self.estimator_ == 'cat':
                clf = CatBoostClassifier(self.num_round_, task_type='GPU', early_stopping_rounds=500, **self.params_)
                clf.fit(trn_data, trn_y, eval_set=(val_data, val_y), cat_features=[], use_best_model=True, verbose=500)
                oof[val_idx] = clf.predict_proba(val_data)[:, 1]
            # scikit-learn model
            else:
                clf = self.estimator_.fit(trn_data, trn_y)
                try:
                    oof[val_idx] = clf.predict_proba(val_data)[:, 1]
                except AttributeError:
                    oof[val_idx] = clf.decision_function(val_data)
            self.clf_list_.append(clf)
            fold_score = roc_auc_score(train_y[val_idx], oof[val_idx])
            self.scores_.append(fold_score)
            print('Fold score: {:<8.5f}'.format(fold_score))
        self.oof_ = oof
        self.score_ = roc_auc_score(train_y, oof)
        print("CV score: {:<8.5f}".format(self.score_))

    def predict(self, test_X):
        self.predictions_ = np.zeros(len(test_X))
        if self.estimator_ == 'lgbm':
            self.feature_importance_df_ = pd.DataFrame()
            for fold, clf in enumerate(self.clf_list_):
                fold_importance_df = pd.DataFrame()
                # the booster knows its own feature names (an undefined global was used here before)
                fold_importance_df["feature"] = clf.feature_name()
                fold_importance_df["importance"] = clf.feature_importance()
                fold_importance_df["fold"] = fold + 1
                self.feature_importance_df_ = pd.concat([self.feature_importance_df_, fold_importance_df], axis=0)
                # weight each fold's prediction by its share of the total fold score
                self.predictions_ += clf.predict(test_X, num_iteration=clf.best_iteration) * (self.scores_[fold] / sum(self.scores_))
        elif self.estimator_ == 'xgb':
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit) \
                                     * (self.scores_[fold] / sum(self.scores_))
        elif self.estimator_ == 'cat':
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict_proba(test_X)[:, 1] * (self.scores_[fold] / sum(self.scores_))
        else:
            for fold, clf in enumerate(self.clf_list_):
                self.predictions_ += clf.predict_proba(test_X)[:, 1] * (self.scores_[fold] / sum(self.scores_))
# Class wrapping CVClassifier for Bayesian optimisation of hyper-parameters
class CVForBO():
    def __init__(self, model, train_X, train_y, test_X, base_params, int_params=[], n_splits=5, num_round=77777):
        self.oofs_ = []
        self.params_ = []
        self.predictions_ = []
        self.cv_scores_ = []
        self.model_ = model
        self.train_X_ = train_X
        self.train_y_ = train_y
        self.test_X_ = test_X
        self.base_params_ = base_params
        self.int_params_ = int_params
        self.n_splits_ = n_splits
        self.num_round_ = num_round

    def cv(self, **opt_params):
        # Round the parameters that must be integers
        for p in self.int_params_:
            if p in opt_params:
                opt_params[p] = int(np.round(opt_params[p]))
        self.base_params_.update(opt_params)
        cv_model = CVClassifier(self.model_, n_splits=self.n_splits_, num_round=self.num_round_, **self.base_params_)
        cv_model.cv(self.train_X_, self.train_y_)
        cv_model.predict(self.test_X_)
        self.oofs_.append(cv_model.oof_)
        self.predictions_.append(cv_model.predictions_)
        # copy() so later updates don't overwrite the stored parameter sets
        self.params_.append(self.base_params_.copy())
        self.cv_scores_.append(cv_model.score_)
        return cv_model.score_

    def post_process(self, model_type=None, oof_path='inter_oofs.csv', pred_path='inter_preds.csv', params_path='inter_params.csv'):
        if not model_type:
            model_type = self.model_
        cols = ['{}_{}_{}'.format(model_type, str(self.cv_scores_[k]).split('.')[-1][:5], k)
                for k in range(len(self.cv_scores_))]
        self.oof_df = pd.DataFrame(np.array(self.oofs_).T, columns=cols)
        self.pred_df = pd.DataFrame(np.array(self.predictions_).T, columns=cols)
        self.params_df = pd.DataFrame(self.params_).T.rename(columns={c_old: c_new for c_old, c_new in enumerate(cols)})
        self.oof_df.to_csv(oof_path)
        self.pred_df.to_csv(pred_path)
        self.params_df.to_csv(params_path)
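Because `cv` returns the CV score, it plugs straight into an optimiser such as the `bayes_opt` package; a sketch under assumed parameter bounds (none of the bounds below come from the original):

from bayes_opt import BayesianOptimization

# Hypothetical search space for LightGBM
cv_helper = CVForBO('lgbm', train_X, train_y, test_X,
                    base_params={'objective': 'binary', 'metric': 'auc'},
                    int_params=['num_leaves'])
optimizer = BayesianOptimization(f=cv_helper.cv,
                                 pbounds={'num_leaves': (31, 127), 'learning_rate': (0.005, 0.1)})
optimizer.maximize(init_points=5, n_iter=20)
cv_helper.post_process()  # dump the OOF predictions, test predictions and params to CSV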
Training a LightGBM model and saving feature importances

import lightgbm as lgb
from sklearn.model_selection import train_test_split

def train_model(final_data_df, model_path):
    # Collect all feature columns
    print('Getting all the features...')
    lgb_feature_list = list(final_data_df.columns.drop(['uid']))
    lgb_df = final_data_df[lgb_feature_list].copy()
    target = 'whether_to_keep'
    # Split the data
    print('Dividing dataset into train set and validation set...')
    train, val = train_test_split(lgb_df, test_size=0.2, random_state=2018)
    train_X = train.drop(target, axis=1)
    train_y = train[target]
    val_X = val.drop(target, axis=1)
    val_y = val[target]
    # Free memory as soon as possible
    del final_data_df
    lgb_feature_list.remove(target)  # list.remove() works in place and returns None
    feature_name = lgb_feature_list
    lgb_train = lgb.Dataset(train_X, train_y, feature_name=feature_name)
    lgb_eval = lgb.Dataset(val_X, val_y, feature_name=feature_name, reference=lgb_train)
    # Saving the Dataset to a LightGBM binary file makes later loading faster:
    print('Saving trainset and valset...')
    lgb_train.save_binary('./train.bin')
    lgb_eval.save_binary('./val.bin')
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_thread': -1,
        'num_leaves': 70,
        'max_depth': 7,
        'learning_rate': 0.01,
        'bagging_freq': 4,
        'bagging_fraction': 0.6,
        'feature_fraction': 0.6,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'num_boost_round': 20000,
        'data_random_seed': 2017
    }
    ## Train
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        early_stopping_rounds=100
    )
    ### Save the model
    model.save_model(model_path, num_iteration=model.best_iteration)
    # Save the feature importances
    importance = model.feature_importance()
    names = model.feature_name()
    with open('./feature_importance.txt', 'w+') as file:
        for index, im in enumerate(importance):
            string = names[index] + ', ' + str(im) + '\n'
            file.write(string)
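The saved booster can be reloaded later for inference; a sketch, where `new_data_X` is a hypothetical DataFrame with the same feature columns:

# Reload the saved model and predict
model = lgb.Booster(model_file=model_path)
preds = model.predict(new_data_X)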
Bucket analysis
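A minimal sketch of a typical bucket analysis with pandas: bin a continuous feature and compare the target rate across bins. The feature name `age` is hypothetical; `whether_to_keep` follows the training section above:

# Cut a continuous feature into 10 equal-frequency buckets
# and inspect sample count and target rate per bucket
train_df['age_bucket'] = pd.qcut(train_df['age'], q=10, duplicates='drop')
bucket_stats = train_df.groupby('age_bucket')['whether_to_keep'].agg(['count', 'mean'])
print(bucket_stats)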