【整合學習】lightgbm入門及模板
轉載自:https://www.cnblogs.com/wanglei5205/
之後會再補充自己的筆記與心得。
安裝 LightGBM
pip install lightgbm
驗證是否安裝成功(能正常匯入、沒有報錯即表示安裝成功):
import lightgbm as lgb
使用案例
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018
@author: hello4720
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
### Load data
print("載入資料")
# The five CSV parts differ only by their numeric suffix (1..5).
csv_template = 'G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data{}.csv'
dataset1, dataset2, dataset3, dataset4, dataset5 = [
    pd.read_csv(csv_template.format(part_no)) for part_no in range(1, 6)
]
# Drop exact duplicate rows from every part, in place.
for part_frame in (dataset1, dataset2, dataset3, dataset4, dataset5):
    part_frame.drop_duplicates(inplace=True)
### Merge data
print("資料合併")
# Parts 1-4 are stacked row-wise into the training pool; part 5 is kept
# aside as the online test set.
trains = pd.concat([dataset1, dataset2, dataset3, dataset4], axis=0)
online_test = dataset5
### Split data
print("資料拆分")
# Two successive 80/20 splits with the same seed:
#   pool -> (train + validation, offline test), then -> (train, validation).
split_options = {'test_size': 0.2, 'random_state': 21}
train_xy, offline_test = train_test_split(trains, **split_options)
train, val = train_test_split(train_xy, **split_options)
# Columns that are removed before a frame is used as a feature matrix.
non_feature_columns = ['instance_id', 'is_trade']
print("訓練集")
y = train['is_trade']                                  # training labels
X = train.drop(non_feature_columns, axis=1)            # training features
print("驗證集")
val_y = val['is_trade']                                # validation labels
val_X = val.drop(non_feature_columns, axis=1)          # validation features
print("測試集")
offline_test_X = offline_test.drop(non_feature_columns, axis=1)  # offline test features
# Only the id column is dropped here -- presumably the online set carries
# no 'is_trade' label; verify against the data.
online_test_X = online_test.drop(['instance_id'], axis=1)        # online test features
### Convert to LightGBM datasets
# free_raw_data=False keeps the raw frames attached to the Dataset objects.
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)
### Training
print('設定引數')
params = {
    # NOTE: the original dict also contained 'boosting': 'dart'.  'boosting'
    # and 'boosting_type' are aliases of the same LightGBM parameter, so the
    # two entries contradicted each other, and early stopping (requested
    # below) is not available in dart mode.  Only the gbdt setting is kept.
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    'num_leaves': 25,
    'max_depth': 3,
    'max_bin': 10,
    'min_data_in_leaf': 8,
    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'min_split_gain': 0,
}
print("開始訓練")
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train() was removed in
# LightGBM 4.0, while the callback works on old and new releases alike.
gbm = lgb.train(
    params,                       # parameter dict
    lgb_train,                    # training set
    num_boost_round=2000,         # upper bound on boosting rounds
    valid_sets=[lgb_eval],        # validation set monitored for early stopping
    callbacks=[lgb.early_stopping(stopping_rounds=30)],  # stop after 30 rounds without improvement
)
### Offline prediction
print ("線下預測")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration)  # predicted probabilities
# .copy() gives an independent frame; assigning new columns to a bare column
# selection triggers pandas' SettingWithCopyWarning.
offline = offline_test[['instance_id', 'is_trade']].copy()
offline['preds'] = preds_offline
offline['is_trade'] = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline['is_trade'], offline['preds']))
### Online prediction
print("線上預測")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # predicted probabilities
online = online_test[['instance_id']].copy()
online['predicted_score'] = preds_online
# Make sure the output directory exists so the predictions are not lost
# to an I/O error after a long training run.
import os
if not os.path.isdir('./data'):
    os.makedirs('./data')
online.to_csv("./data/20180405.txt", index=False, sep=' ')
### Save the model
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; the
# standalone `joblib` package (a scikit-learn dependency) is the supported
# import.  The fallback keeps very old scikit-learn installs working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl')
### Feature importance
df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
df['importance'] = list(gbm.feature_importance())
df = df.sort_values(by='importance', ascending=False)  # most important first
# Make sure the output directory exists before writing the report.
import os
if not os.path.isdir('./data'):
    os.makedirs('./data')
df.to_csv("./data/feature_score_20180405.csv", index=False, encoding='gbk')
lightgbm引數介紹(sklearn)
lightgbm調參案例
# -*- coding: utf-8 -*-
"""
# 作者:wanglei5205
# 郵箱:wanglei5205@126.com
# 部落格:http://cnblogs.com/wanglei5205
# github:http://github.com/wanglei5205
"""
### 匯入模組
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
### Load data
print('載入資料')
# The five CSV parts differ only by their numeric suffix (1..5).
csv_template = 'G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data{}.csv'
dataset1, dataset2, dataset3, dataset4, dataset5 = [
    pd.read_csv(csv_template.format(part_no)) for part_no in range(1, 6)
]
print('資料去重')
# Drop exact duplicate rows from every part, in place.
for part_frame in (dataset1, dataset2, dataset3, dataset4, dataset5):
    part_frame.drop_duplicates(inplace=True)
print('資料合併')
# Parts 1-4 are stacked row-wise into the training pool; part 5 is kept
# aside as the online test set.
trains = pd.concat([dataset1, dataset2, dataset3, dataset4], axis=0)
online_test = dataset5
### Split data (train + validation + test)
print('資料拆分')
from sklearn.model_selection import train_test_split
# Two successive 80/20 splits with the same seed:
#   pool -> (train + validation, offline test), then -> (train, validation).
split_options = {'test_size': 0.2, 'random_state': 21}
train_xy, offline_test = train_test_split(trains, **split_options)
train, val = train_test_split(train_xy, **split_options)
# Columns that are removed before a frame is used as a feature matrix.
non_feature_columns = ['instance_id', 'is_trade']
# Training set
y_train = train['is_trade']                            # training labels
X_train = train.drop(non_feature_columns, axis=1)      # training features
# Validation set
y_val = val['is_trade']                                # validation labels
X_val = val.drop(non_feature_columns, axis=1)          # validation features
# Test sets
offline_test_X = offline_test.drop(non_feature_columns, axis=1)  # offline test features
# Only the id column is dropped here -- presumably the online set carries
# no 'is_trade' label; verify against the data.
online_test_X = online_test.drop(['instance_id'], axis=1)        # online test features
### Convert to LightGBM datasets
print('資料轉換')
# free_raw_data=False keeps the raw frames attached to the Dataset objects.
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)
### Initial parameters -- the tuned values are filled in stage by stage below
import itertools

print('設定引數')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
}


def _cv_min_error(cv_params, train_set, seed, stopping_rounds):
    """Return the lowest mean 3-fold CV ``binary_error`` reached for *cv_params*.

    Args:
        cv_params: LightGBM parameter dict to evaluate.
        train_set: ``lgb.Dataset`` that is split into the CV folds.
        seed: seed forwarded to ``lgb.cv`` for fold generation.
        stopping_rounds: rounds without improvement before CV stops early.

    Returns:
        The minimum of the per-iteration mean ``binary_error`` values.
    """
    # lgb.cv() lost its `early_stopping_rounds=` / `verbose_eval=` keywords in
    # LightGBM 4.0; the equivalent callbacks work on old and new releases.
    # `log_evaluation` replaced `print_evaluation` in LightGBM 3.3.
    make_logger = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    cv_results = lgb.cv(
        cv_params,
        train_set,
        seed=seed,
        nfold=3,
        metrics=['binary_error'],
        callbacks=[
            lgb.early_stopping(stopping_rounds, verbose=False),
            make_logger(),
        ],
    )
    # The result key is 'binary_error-mean' before LightGBM 4.0 and
    # 'valid binary_error-mean' from 4.0 on; accept either spelling.
    mean_key = [key for key in cv_results if key.endswith('binary_error-mean')][0]
    return min(cv_results[mean_key])


def _tune_stage(stage_params, train_set, grid, seed, stopping_rounds):
    """Grid-search one group of parameters and store the winner in *stage_params*.

    Every combination of the values in *grid* is evaluated on top of the
    current *stage_params*; the first combination with the lowest CV error
    wins and is written back into *stage_params*.

    Args:
        stage_params: parameter dict, updated in place with the best combination.
        train_set: ``lgb.Dataset`` used for cross-validation.
        grid: list of ``(parameter_name, candidate_values)`` pairs; the first
            pair varies slowest, like the outermost loop of a nested loop.
        seed: seed forwarded to ``lgb.cv``.
        stopping_rounds: early-stopping patience forwarded to ``lgb.cv``.

    Returns:
        ``(best_combo, best_error)`` -- the winning ``{name: value}`` dict and
        its CV error.
    """
    names = [name for name, _ in grid]
    candidates = [values for _, values in grid]
    # The running minimum is local to the stage.  The original script shared
    # one minimum across all stages, so a stage that never beat an earlier
    # stage recorded nothing and the following best_params[...] lookup
    # raised KeyError.
    best_error = float('Inf')
    best_combo = None
    for combo in itertools.product(*candidates):
        trial_params = dict(stage_params)
        trial_params.update(zip(names, combo))
        error = _cv_min_error(trial_params, train_set, seed, stopping_rounds)
        if error < best_error:
            best_error = error
            best_combo = dict(zip(names, combo))
    if best_combo is not None:
        stage_params.update(best_combo)
    return best_combo, best_error


### Cross-validation (parameter tuning)
print('交叉驗證')
best_params = {}
# 0.1, 0.2, ..., 1.0 -- 0.0 is left out because LightGBM requires
# feature_fraction and bagging_fraction to be strictly greater than 0.
positive_fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# 0.0, 0.1, ..., 1.0 for the regularisation terms, where 0 is a valid value.
unit_steps = [0.0] + positive_fractions

# Stage 1: tree capacity
print("調參1:提高準確率")
stage_best, min_merror = _tune_stage(
    params, lgb_train,
    [('num_leaves', range(20, 200, 5)),
     ('max_depth', range(3, 8, 1))],
    seed=2018, stopping_rounds=10,
)
best_params.update(stage_best or {})

# Stage 2: histogram resolution and minimum leaf size (over-fitting control)
print("調參2:降低過擬合")
stage_best, min_merror = _tune_stage(
    params, lgb_train,
    # The original grid started at max_bin=1, which LightGBM rejects
    # (max_bin must be > 1); the remaining grid points are unchanged.
    [('max_bin', range(6, 255, 5)),
     ('min_data_in_leaf', range(10, 200, 5))],
    seed=42, stopping_rounds=3,
)
best_params.update(stage_best or {})

# Stage 3: feature and row sub-sampling
print("調參3:降低過擬合")
stage_best, min_merror = _tune_stage(
    params, lgb_train,
    [('feature_fraction', positive_fractions),
     ('bagging_fraction', positive_fractions),
     ('bagging_freq', range(0, 50, 5))],
    seed=42, stopping_rounds=3,
)
best_params.update(stage_best or {})

# Stage 4: regularisation
print("調參4:降低過擬合")
stage_best, min_merror = _tune_stage(
    params, lgb_train,
    [('lambda_l1', unit_steps),
     ('lambda_l2', unit_steps),
     ('min_split_gain', unit_steps)],
    seed=42, stopping_rounds=3,
)
best_params.update(stage_best or {})
print(best_params)
### Training
params['learning_rate'] = 0.01
# Keep the trained Booster.  The original discarded the return value of
# lgb.train() and then called predict()/best_iteration/feature_importance()
# on the `lgb` *module*, which has none of those attributes.
# Early stopping uses the callback API because the `early_stopping_rounds=`
# keyword was removed from lgb.train() in LightGBM 4.0.
gbm = lgb.train(
    params,                       # parameter dict
    lgb_train,                    # training set
    valid_sets=[lgb_eval],        # validation set monitored for early stopping
    num_boost_round=2000,         # upper bound on boosting rounds
    callbacks=[lgb.early_stopping(stopping_rounds=50)],  # stop after 50 rounds without improvement
)
### Offline prediction
print ("線下預測")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration)  # predicted probabilities
# .copy() gives an independent frame; assigning new columns to a bare column
# selection triggers pandas' SettingWithCopyWarning.
offline = offline_test[['instance_id', 'is_trade']].copy()
offline['preds'] = preds_offline
offline['is_trade'] = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline['is_trade'], offline['preds']))
### Online prediction
print("線上預測")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # predicted probabilities
online = online_test[['instance_id']].copy()
online['predicted_score'] = preds_online               # final column name of the submission
# Make sure the output directory exists so results are not lost to an I/O
# error after a long tuning and training run.
import os
if not os.path.isdir('./data'):
    os.makedirs('./data')
online.to_csv("./data/20180405.txt", index=False, sep=' ')  # save predictions
### Save the model
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; the
# standalone `joblib` package (a scikit-learn dependency) is the supported
# import.  The fallback keeps very old scikit-learn installs working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(gbm, 'lgb.pkl')
### Feature importance
df = pd.DataFrame(X_train.columns.tolist(), columns=['feature'])
df['importance'] = list(gbm.feature_importance())      # importance score per feature
df = df.sort_values(by='importance', ascending=False)  # most important first
df.to_csv("./data/feature_score_20180331.csv", index=False, encoding='gbk')  # save scores
相關文章
- 機器學習-整合學習LightGBM機器學習
- Nacos整合學習入門
- smarty模板入門學習(share)
- 整合學習入門介紹
- Flask入門學習---初步瞭解模板Flask
- C++入門學習——標準模板庫之vectorC++
- SpringBoot整合ElasticSearch 入門demo學習筆記Spring BootElasticsearch筆記
- Redis入門及常用命令學習Redis
- Pytest學習(一)- 入門及基礎
- Zookeeper入門學習--01介紹及安裝
- jqueryEasyUi 入門模板jQueryUI
- mybatis入門學習MyBatis
- Nginx入門學習Nginx
- Vue入門學習Vue
- ROS入門學習ROS
- nuxt 入門學習UX
- GORM學習入門GoORM
- afl入門學習
- Spark入門學習Spark
- React入門學習React
- 現在學習網路安全來得及嗎?web安全入門學習Web
- Linux入門及進階學習推薦書籍Linux
- Linux入門學習Linux
- MyBatisPlus入門學習MyBatis
- spring入門學習Spring
- Mybatis框架 入門學習MyBatis框架
- leaflet學習一 入門
- linux學習——入門Linux
- MyBatis入門學習(一)MyBatis
- Android學習 - 入門Android
- JavaScript入門學習學習筆記(上)JavaScript筆記
- Java入門學習-學習static的用法Java
- python 入門學習---模組匯入三種方式及中文註釋Python
- 整合學習(一):簡述整合學習
- vue 快速入門 系列 —— 模板Vue
- jfinal enjoy模板入門
- TS入門學習筆記筆記
- 【PostgreSQL】入門學習筆記SQL筆記