[Ensemble Learning] Getting started with LightGBM, with a template

Posted by alicelmx on 2018-12-02

Reposted from: https://www.cnblogs.com/wanglei5205/
I will add my own material here later~


Installing LightGBM

pip install lightgbm

Verify that the installation succeeded:

import lightgbm as lgb
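
If the import raises no error, the installation succeeded. A quick extra sanity check is to print the installed version (LightGBM exposes the standard __version__ attribute):

print(lgb.__version__)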

Usage example

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018

@author: hello4720
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Load data
print("Loading data")
dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')

dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset4.drop_duplicates(inplace=True)
dataset5.drop_duplicates(inplace=True)

### Merge data
print("Merging data")
trains = pd.concat([dataset1,dataset2],axis=0)
trains = pd.concat([trains,dataset3],axis=0)
trains = pd.concat([trains,dataset4],axis=0)

online_test = dataset5

### Split the data: hold out 20% as an offline test set, then 20% of the rest as a validation set
print("Splitting data")
train_xy,offline_test = train_test_split(trains, test_size = 0.2,random_state=21)
train,val = train_test_split(train_xy, test_size = 0.2,random_state=21)

print("訓練集")
y = train.is_trade                                                  # 訓練集標籤
X = train.drop(['instance_id','is_trade'],axis=1)                   # 訓練集特徵矩陣

print("驗證集")
val_y = val.is_trade                                                # 驗證集標籤
val_X = val.drop(['instance_id','is_trade'],axis=1)                 # 驗證集特徵矩陣

print("測試集")
offline_test_X=offline_test.drop(['instance_id','is_trade'],axis=1) # 線下測試特徵矩陣
online_test_X=online_test.drop(['instance_id'],axis=1)              # 線上測試特徵矩陣

### Convert data into LightGBM Datasets
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train,free_raw_data=False)

### Training
print('Setting parameters')
params = {
            'boosting_type': 'gbdt',    # note: 'boosting' is an alias of 'boosting_type'; set only one
            'objective': 'binary',
            'metric': 'binary_logloss',

            'learning_rate': 0.01,
            'num_leaves':25,
            'max_depth':3,

            'max_bin':10,
            'min_data_in_leaf':8,

            'feature_fraction': 0.6,
            'bagging_fraction': 1,
            'bagging_freq':0,

            'lambda_l1': 0,
            'lambda_l2': 0,
            'min_split_gain': 0
}

print("開始訓練")
gbm = lgb.train(params,                     # 引數字典
                lgb_train,                  # 訓練集
                num_boost_round=2000,       # 迭代次數
                valid_sets=lgb_eval,        # 驗證集
                early_stopping_rounds=30)   # 早停係數
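# Note: LightGBM >= 4.0 removed the early_stopping_rounds argument from
# lgb.train(); the equivalent there is the early-stopping callback:
#   gbm = lgb.train(params, lgb_train, num_boost_round=2000,
#                   valid_sets=lgb_eval,
#                   callbacks=[lgb.early_stopping(stopping_rounds=30)])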
### Offline prediction
print("Offline prediction")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # outputs probabilities
offline = offline_test[['instance_id','is_trade']].copy()   # .copy() avoids SettingWithCopyWarning
offline['preds'] = preds_offline
offline.is_trade = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

### Online prediction
print("Online prediction")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # outputs probabilities
online = online_test[['instance_id']].copy()
online['preds'] = preds_online
online.rename(columns={'preds':'predicted_score'},inplace=True)
online.to_csv("./data/20180405.txt",index=None,sep=' ')

### Save the model
import joblib   # sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
joblib.dump(gbm,'gbm.pkl')
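# To restore the trained booster later (same file path assumed):
#   gbm = joblib.load('gbm.pkl')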

### Feature importance
df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
df['importance'] = list(gbm.feature_importance())
df = df.sort_values(by='importance',ascending=False)
df.to_csv("./data/feature_score_20180405.csv",index=None,encoding='gbk')

LightGBM parameters (sklearn API)

[Image in the original post: a table of LightGBM sklearn-API parameters]
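
Since the table image isn't reproduced, here is a minimal sketch of the sklearn-style interface with the most commonly tuned parameters; the values below are illustrative, not recommendations:

from lightgbm import LGBMClassifier

clf = LGBMClassifier(
    boosting_type='gbdt',     # gbdt / dart / rf
    num_leaves=31,            # main capacity control
    max_depth=-1,             # -1 means no depth limit
    learning_rate=0.1,
    n_estimators=100,         # number of boosting rounds
    subsample=0.8,            # sklearn-API name for bagging_fraction
    subsample_freq=1,         # sklearn-API name for bagging_freq
    colsample_bytree=0.8,     # sklearn-API name for feature_fraction
    reg_alpha=0.0,            # sklearn-API name for lambda_l1
    reg_lambda=0.0,           # sklearn-API name for lambda_l2
)
# clf.fit(X, y)
# proba = clf.predict_proba(val_X)[:, 1]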

A LightGBM tuning example

The script below tunes greedily, stage by stage: step 1 targets accuracy (num_leaves, max_depth); steps 2-4 rein in overfitting (max_bin and min_data_in_leaf, then feature_fraction / bagging_fraction / bagging_freq, and finally lambda_l1 / lambda_l2 / min_split_gain).

# -*- coding: utf-8 -*-
"""
# Author: wanglei5205
# Email: wanglei5205@126.com
# Blog: http://cnblogs.com/wanglei5205
# GitHub: http://github.com/wanglei5205
"""
### Import modules
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics

### Load data
print('Loading data')
dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')

print('Removing duplicates')
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset4.drop_duplicates(inplace=True)
dataset5.drop_duplicates(inplace=True)

print('Merging data')
trains = pd.concat([dataset1,dataset2],axis=0)
trains = pd.concat([trains,dataset3],axis=0)
trains = pd.concat([trains,dataset4],axis=0)

online_test = dataset5

### Split the data (train + validation + test)
print('Splitting data')
from sklearn.model_selection import train_test_split
train_xy,offline_test = train_test_split(trains,test_size = 0.2,random_state=21)
train,val = train_test_split(train_xy,test_size = 0.2,random_state=21)

# Training set
y_train = train.is_trade                                               # training labels
X_train = train.drop(['instance_id','is_trade'],axis=1)                # training feature matrix

# Validation set
y_val = val.is_trade                                                   # validation labels
X_val = val.drop(['instance_id','is_trade'],axis=1)                    # validation feature matrix

# Test sets
offline_test_X = offline_test.drop(['instance_id','is_trade'],axis=1)  # offline test feature matrix
online_test_X  = online_test.drop(['instance_id'],axis=1)              # online test feature matrix

### Convert data into LightGBM Datasets
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,free_raw_data=False)

### Set initial parameters (excluding those to be cross-validated)
print('Setting parameters')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'binary_logloss',
          }

### Cross-validation (tuning)
print('Cross-validation')
min_merror = float('Inf')
best_params = {}

# Accuracy
print("Tuning step 1: improve accuracy")
for num_leaves in range(20,200,5):      # note: a tree with max_depth=d can reach at most 2**d leaves
    for max_depth in range(3,8,1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth

        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=2018,
                            nfold=3,
                            metrics=['binary_error'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )

        mean_merror = pd.Series(cv_results['binary_error-mean']).min()
        boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

        if mean_merror < min_merror:
            min_merror = mean_merror
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

params['num_leaves'] = best_params['num_leaves']
params['max_depth'] = best_params['max_depth']
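
# Note: LightGBM >= 4.0 removed early_stopping_rounds and verbose_eval from
# lgb.cv() as well; pass callbacks=[lgb.early_stopping(10)] instead. The keys
# of the returned dict also gained a "valid " prefix there
# (e.g. "valid binary_error-mean").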

# Overfitting
print("Tuning step 2: reduce overfitting")
min_merror = float('Inf')               # reset per stage; otherwise best_params may never get these keys
for max_bin in range(5,255,5):          # max_bin must be greater than 1
    for min_data_in_leaf in range(10,200,5):
            params['max_bin'] = max_bin
            params['min_data_in_leaf'] = min_data_in_leaf

            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=42,
                                nfold=3,
                                metrics=['binary_error'],
                                early_stopping_rounds=3,
                                verbose_eval=True
                                )

            mean_merror = pd.Series(cv_results['binary_error-mean']).min()
            boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

            if mean_merror < min_merror:
                min_merror = mean_merror
                best_params['max_bin']= max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf

params['min_data_in_leaf'] = best_params['min_data_in_leaf']
params['max_bin'] = best_params['max_bin']

print("調參3:降低過擬合")
for feature_fraction in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    for bagging_fraction in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=42,
                                nfold=3,
                                metrics=['binary_error'],
                                early_stopping_rounds=3,
                                verbose_eval=True
                                )

            mean_merror = pd.Series(cv_results['binary_error-mean']).min()
            boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

            if mean_merror < min_merror:
                min_merror = mean_merror
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

params['feature_fraction'] = best_params['feature_fraction']
params['bagging_fraction'] = best_params['bagging_fraction']
params['bagging_freq'] = best_params['bagging_freq']

print("調參4:降低過擬合")
for lambda_l1 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    for lambda_l2 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
        for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
            params['lambda_l1'] = lambda_l1
            params['lambda_l2'] = lambda_l2
            params['min_split_gain'] = min_split_gain

            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=42,
                                nfold=3,
                                metrics=['binary_error'],
                                early_stopping_rounds=3,
                                verbose_eval=True
                                )

            mean_merror = pd.Series(cv_results['binary_error-mean']).min()
            boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()

            if mean_merror < min_merror:
                min_merror = mean_merror
                best_params['lambda_l1'] = lambda_l1
                best_params['lambda_l2'] = lambda_l2
                best_params['min_split_gain'] = min_split_gain

params['lambda_l1'] = best_params['lambda_l1']
params['lambda_l2'] = best_params['lambda_l2']
params['min_split_gain'] = best_params['min_split_gain']


print(best_params)

### Train
params['learning_rate']=0.01
gbm = lgb.train(
          params,                     # parameter dict
          lgb_train,                  # training set
          valid_sets=lgb_eval,        # validation set
          num_boost_round=2000,       # number of boosting rounds
          early_stopping_rounds=50    # early-stopping rounds
          )

### Offline prediction
print("Offline prediction")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # outputs probabilities
offline = offline_test[['instance_id','is_trade']].copy()
offline['preds'] = preds_offline
offline.is_trade = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

### Online prediction
print("Online prediction")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # outputs probabilities
online = online_test[['instance_id']].copy()
online['preds'] = preds_online
online.rename(columns={'preds':'predicted_score'},inplace=True)           # rename column
online.to_csv("./data/20180405.txt",index=None,sep=' ')                   # save results

### Save the model
import joblib   # sklearn.externals.joblib was removed in scikit-learn 0.23
joblib.dump(gbm,'lgb.pkl')   # dump the trained booster, not the lgb module

### Feature importance
df = pd.DataFrame(X_train.columns.tolist(), columns=['feature'])
df['importance']=list(gbm.feature_importance())                           # importance scores
df = df.sort_values(by='importance',ascending=False)                      # sort features
df.to_csv("./data/feature_score_20180331.csv",index=None,encoding='gbk')  # save scores
