Tianchi Beginner Competition: O2O Coupon Usage Prediction (Rank 181)

Posted by AI信仰者 on 2020-11-08

Data
The competition provides users' real online and offline consumption records from 2016-01-01 to 2016-06-30; the task is to predict whether a user will redeem a coupon within 15 days of receiving it in July 2016.

For details, see the competition page: o2o優惠券使用預測.

Approach:
Drop unneeded features
Fill missing values
Compute statistical features
Blend ['gbdt', 'xgb', 'rf_gini', 'et_gini', 'lgb', 'cat'] for the final prediction (a minimal sketch of the idea follows)
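The blending step stacks out-of-fold predictions from the base models and fits a simple meta-model on top. Below is a minimal sketch of that idea, using toy data from make_classification and only two stand-in base models; the actual pipeline further down uses the tuned GBDT/XGBoost/LightGBM/CatBoost/RF/ET classifiers on the engineered coupon features.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

# toy data standing in for the engineered o2o features
X, y = make_classification(n_samples=2000, random_state=0)
X_train, X_hold, y_train, y_hold = train_test_split(X, y, random_state=0)

base_models = [GradientBoostingClassifier(), RandomForestClassifier()]
skf = StratifiedKFold(n_splits=5)

meta_train = np.zeros((len(X_train), len(base_models)))  # out-of-fold predictions
meta_hold = np.zeros((len(X_hold), len(base_models)))    # fold-averaged holdout predictions

for j, clf in enumerate(base_models):
    fold_preds = []
    for tr_idx, te_idx in skf.split(X_train, y_train):
        clf.fit(X_train[tr_idx], y_train[tr_idx])
        # predictions on the held-out fold become a meta-feature column
        meta_train[te_idx, j] = clf.predict_proba(X_train[te_idx])[:, 1]
        fold_preds.append(clf.predict_proba(X_hold)[:, 1])
    meta_hold[:, j] = np.mean(fold_preds, axis=0)

# a logistic regression stacks the base-model outputs
stacker = LogisticRegression().fit(meta_train, y_train)
print('blend auc:', roc_auc_score(y_hold, stacker.predict_proba(meta_hold)[:, 1]))

Fitting only a logistic regression on the out-of-fold columns keeps the meta-model simple and hard to overfit, which is also what the blending() function below does.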


Model 1: score 0.8

# work around a LightGBM error caused by a duplicate OpenMP runtime
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import datetime
import os
from concurrent.futures import ProcessPoolExecutor
from math import ceil

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier

os.chdir(r'E:\專案檔案\o2o優惠券使用預測')

# dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
# dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
# dfon = pd.read_csv('ccf_online_stage1_train.csv')

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


def drop_columns(X, predict=False):
    columns = [
        'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
        # 'u33', 'u34'
    ]

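    # At predict time Coupon_id is dropped here because it is kept separately for the
    # submission file; at train time Date is dropped instead, since the label is derived from it.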
    if predict:
        columns.append('Coupon_id')
    else:
        columns.append('Date')

    X.drop(columns=columns, inplace=True)


def get_preprocess_data(predict=False):
    if predict:
        offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
    else:
        offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])

    offline.Distance.fillna(11, inplace=True)
    offline.Distance = offline.Distance.astype(int)
    offline.Coupon_id.fillna(0, inplace=True)
    offline.Coupon_id = offline.Coupon_id.astype(int)
    offline.Date_received.fillna(date_null, inplace=True)

    offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
        'Discount_rate'].str.split(':', expand=True).astype(int)
    offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
    offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)
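    # e.g. a '150:20' full-reduction coupon becomes 1 - 20/150 ≈ 0.867; plain rates such as '0.95' fall through fillna unchanged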

    if predict:
        return offline

    offline.Date.fillna(date_null, inplace=True)

    # online
    online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])

    online.Coupon_id.fillna(0, inplace=True)
    # online.Coupon_id = online.Coupon_id.astype(int)
    online.Date_received.fillna(date_null, inplace=True)
    online.Date.fillna(date_null, inplace=True)

    return offline, online


def task(X_chunk, X, counter):
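    # For every coupon-receive record in this chunk, scan the same user's other records to count
    # coupons received before/after this one (o3/o4), the same coupon before/after (o5/o6),
    # and the day gaps to the previous/next receive (o17/o18). Chunks run in parallel processes.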
    print(counter, end=',', flush=True)
    X_chunk = X_chunk.copy()

    X_chunk['o17'] = -1
    X_chunk['o18'] = -1

    for i, user in X_chunk.iterrows():
        temp = X[X.User_id == user.User_id]

        temp1 = temp[temp.Date_received < user.Date_received]
        temp2 = temp[temp.Date_received > user.Date_received]

        # 使用者此次之後/前領取的所有優惠券數目
        X_chunk.loc[i, 'o3'] = len(temp1)
        X_chunk.loc[i, 'o4'] = len(temp2)

        # 使用者此次之後/前領取的特定優惠券數目
        X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
        X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])

        # 使用者上/下一次領取的時間間隔
        temp1 = temp1.sort_values(by='Date_received', ascending=False)
        if len(temp1):
            X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days

        temp2 = temp2.sort_values(by='Date_received')
        if len(temp2):
            X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days

    return X_chunk


def get_offline_features(X, offline):
    # X = X[:1000]

    print(len(X), len(X.columns))

    temp = offline[offline.Coupon_id != 0]
    coupon_consume = temp[temp.Date != date_null]
    coupon_no_consume = temp[temp.Date == date_null]

    user_coupon_consume = coupon_consume.groupby('User_id')

    X['weekday'] = X.Date_received.dt.weekday
    X['day'] = X.Date_received.dt.day

    # # 距離優惠券消費次數
    # temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
    # X = pd.merge(X, temp, how='left', on='Distance')
    #
    # # 距離優惠券不消費次數
    # temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
    # X = pd.merge(X, temp, how='left', on='Distance')
    #
    # # 距離優惠券領取次數
    # X['distance_2'] = X.distance_0 + X.distance_1
    #
    # # 距離優惠券消費率
    # X['distance_3'] = X.distance_0 / X.distance_2

    # temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
    # temp['d4'] = temp.Distance.sum() / len(temp)
    # X = pd.merge(X, temp, how='left', on='Distance')

    '''user features'''

    # 優惠券消費次數
    temp = user_coupon_consume.size().reset_index(name='u2')
    X = pd.merge(X, temp, how='left', on='User_id')
    # X.u2.fillna(0, inplace=True)
    # X.u2 = X.u2.astype(int)

    # 優惠券不消費次數
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用優惠券次數與沒使用優惠券次數比值
    X['u19'] = X.u2 / X.u3

    # 領取優惠券次數
    X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)

    # 優惠券核銷率
    X['u4'] = X.u2 / X.u1

    # 普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp1 = temp.groupby('User_id').size().reset_index(name='u5')
    X = pd.merge(X, temp1, how='left', on='User_id')

    # 一共消費多少次
    X['u25'] = X.u2 + X.u5

    # 使用者使用優惠券消費佔比
    X['u20'] = X.u2 / X.u25

    # 正常消費平均間隔
    temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')

    # 優惠券消費平均間隔
    temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')

    # 15天內平均會普通消費幾次
    X['u8'] = X.u6 / 15

    # 15天內平均會優惠券消費幾次
    X['u9'] = X.u7 / 15

    # 領取優惠券到使用優惠券的平均間隔時間
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 在15天內使用掉優惠券的值大小
    X['u11'] = X.u10 / 15

    # 領取優惠券到使用優惠券間隔小於15天的次數
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = temp[temp.days <= 15]
    temp = temp.groupby('User_id').size().reset_index(name='u21')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者15天使用掉優惠券的次數除以使用優惠券的次數
    X['u22'] = X.u21 / X.u2

    # 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
    X['u23'] = X.u21 / X.u3

    # 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
    X['u24'] = X.u21 / X.u1

    # 消費優惠券的平均折率
    temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券的最低消費折率
    temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券的最高消費折率
    temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷過的不同優惠券數量
    temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='u32')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者領取所有不同優惠券數量
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])

    # 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
    X['u33'] = X.u32 / X.u47

    # 使用者平均每種優惠券核銷多少張
    X['u34'] = X.u2 / X.u47

    # 核銷優惠券使用者-商家平均距離
    temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
    temp = temp.groupby('User_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
    temp['u35'] = temp.y / temp.x
    temp = temp[['User_id', 'u35']]
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券中的最小使用者-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券中的最大使用者-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 優惠券型別
    discount_types = [
        '0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
        '20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
        '50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
        '200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
        '300:10', '200:5', '50:1', '100:1',
    ]
    X['discount_type'] = -1
    for k, v in enumerate(discount_types):
        X.loc[X.Discount_rate == v, 'discount_type'] = k

    # 不同優惠券領取次數
    temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同優惠券使用次數
    temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同優惠券不使用次數
    temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同打折優惠券使用率
    X['u44'] = X.u42 / X.u41

    # 滿減型別優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains(':') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u48')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 打折型別優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u49')
    X = pd.merge(X, temp, how='left', on='User_id')

    '''offline merchant features'''

    # 商戶消費次數
    temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券被領取後核銷次數
    temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商戶正常消費筆數
    X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)

    # 商家優惠券被領取次數
    temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券被領取後核銷率
    X['m4'] = X.m1 / X.m3

    # 商家優惠券被領取後不核銷次數
    temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商戶當天優惠券領取次數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # 商戶當天優惠券領取人數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # 商家優惠券核銷的平均消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷的最大消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷的最小消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷不同的使用者數量
    temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券領取不同的使用者數量
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
    X['m13'] = X.m11 / X.m12

    # 商家優惠券平均每個使用者核銷多少張
    X['m14'] = X.m1 / X.m12

    # 商家被核銷過的不同優惠券數量
    temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家領取過的不同優惠券數量的比重
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
    X['m19'] = X.m15 / X.m18

    # 商家被核銷優惠券的平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
    temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('Merchant_id')
    X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家平均距離
    temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
    temp['m21'] = temp.y / temp.x
    temp = temp[['Merchant_id', 'm21']]
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家最小距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家最大距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    """offline coupon features"""

    # 此優惠券一共發行多少張
    temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # 此優惠券一共被使用多少張
    temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # 優惠券使用率
    X['c3'] = X.c2 / X.c1

    # 沒有使用的數目
    X['c4'] = X.c1 - X.c2

    # 此優惠券在當天發行了多少張
    temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
    X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])

    # 優惠券型別(直接優惠為0, 滿減為1)
    X['c6'] = 0
    X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1

    # 不同打折優惠券領取次數
    temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券使用次數
    temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券不使用次數
    temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券使用率
    X['c11'] = X.c9 / X.c8

    # 優惠券核銷平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
    temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
    temp = temp.drop_duplicates('Coupon_id')
    X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')

    '''user merchant feature'''

    # 使用者領取商家的優惠券次數
    temp = offline[offline.Coupon_id != 0]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後不核銷次數
    temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後核銷次數
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後核銷率
    X['um4'] = X.um3 / X.um1

    # 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
    X = pd.merge(X, temp, how='left', on='User_id')
    X['um5'] = X.um2 / X.temp
    X.drop(columns='temp', inplace=True)

    # 使用者在商店總共消費過幾次
    temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者在商店普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者當天在此商店領取的優惠券數目
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])

    # 使用者領取優惠券不同商家數量
    temp = offline[offline.Coupon_id == offline.Coupon_id]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
    temp = temp.groupby('User_id').size().reset_index(name='um9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券不同商家數量
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='um10')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
    X['um11'] = X.um10 / X.um9

    # 使用者平均核銷每個商家多少張優惠券
    X['um12'] = X.u2 / X.um9

    '''other feature'''

    # 使用者領取的所有優惠券數目
    temp = X.groupby('User_id').size().reset_index(name='o1')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])

    # parallel feature computation (a process pool, not threads)
    # split the rows into one chunk per worker
    stop = len(X)
    step = int(ceil(stop / cpu_jobs))

    X_chunks = [X[i:i + step] for i in range(0, stop, step)]
    X_list = [X] * cpu_jobs
    counters = [i for i in range(cpu_jobs)]

    start = datetime.datetime.now()
    with ProcessPoolExecutor() as e:
        X = pd.concat(e.map(task, X_chunks, X_list, counters))
        print('time:', str(datetime.datetime.now() - start).split('.')[0])
    # end of parallel feature computation

    # 使用者領取優惠券平均時間間隔
    temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')

    # 使用者領取特定商家的優惠券數目
    temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取的不同商家數目
    temp = X.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者當天領取的優惠券數目
    temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])

    # 使用者當天領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])

    # 使用者領取的所有優惠券種類數目
    temp = X.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o12')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 商家被領取的優惠券數目
    temp = X.groupby('Merchant_id').size().reset_index(name='o13')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被領取的特定優惠券數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])

    # 商家被多少不同使用者領取的數目
    temp = X.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家發行的所有優惠券種類數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    print(len(X), len(X.columns))

    return X


def get_online_features(online, X):
    # temp = online[online.Coupon_id == online.Coupon_id]
    # coupon_consume = temp[temp.Date == temp.Date]
    # coupon_no_consume = temp[temp.Date != temp.Date]

    # 使用者線上操作次數
    temp = online.groupby('User_id').size().reset_index(name='on_u1')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上點選次數
    temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上點選率
    X['on_u3'] = X.on_u2 / X.on_u1

    # 使用者線上購買次數
    temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上購買率
    X['on_u5'] = X.on_u4 / X.on_u1

    # 使用者線上領取次數
    temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上領取率
    X['on_u7'] = X.on_u6 / X.on_u1

    # 使用者線上不消費次數
    temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u8')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上優惠券核銷次數
    temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上優惠券核銷率
    X['on_u10'] = X.on_u9 / X.on_u6

    # 使用者線下不消費次數佔線上線下總的不消費次數的比重
    X['on_u11'] = X.u3 / (X.on_u8 + X.u3)

    # 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
    X['on_u12'] = X.u2 / (X.on_u9 + X.u2)

    # 使用者線下領取的記錄數量佔總的記錄數量的比重
    X['on_u13'] = X.u1 / (X.on_u6 + X.u1)

    # # 消費優惠券的平均折率
    # temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 使用者核銷優惠券的最低消費折率
    # temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 使用者核銷優惠券的最高消費折率
    # temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 不同打折優惠券領取次數
    # temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券使用次數
    # temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券不使用次數
    # temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券使用率
    # X['oc4'] = X.oc2 / X.oc1

    print(len(X), len(X.columns))
    print('----------')

    return X


def get_train_data():
    path = 'cache_%s_train.csv' % os.path.basename(__file__)

    if os.path.exists(path):
        data = pd.read_csv(path)
    else:
        offline, online = get_preprocess_data()

        # date received 2016-01-01 - 2016-06-15
        # date consumed 2016-01-01 - 2016-06-30

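        # Sliding-window scheme: each training slice takes its label from a later receive window,
        # while features are built only from earlier offline/online records, to avoid label leakage.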
        # train data 1
        # 2016-04-16 ~ 2016-05-15
        data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
        data_1['label'] = 0
        data_1.loc[
            (data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1

        # feature data 1
        # 領券 2016-01-01 ~ 2016-03-31
        end = '2016-03-31'
        data_off_1 = offline[offline.Date_received <= end]
        data_on_1 = online[online.Date_received <= end]

        # 普通消費 2016-01-01 ~ 2016-04-15
        end = '2016-04-15'
        data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
        data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]

        data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
        data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)

        # train data 2
        # 2016-05-16 ~ 2016-06-15
        data_2 = offline[('2016-05-16' <= offline.Date_received) & (offline.Date_received <= '2016-06-15')].copy()
        data_2['label'] = 0
        data_2.loc[
            (data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1

        # feature data 2
        # 領券
        start = '2016-02-01'
        end = '2016-04-30'
        data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
        data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]

        # 普通消費
        start = '2016-02-01'
        end = '2016-05-15'
        data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
        data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]

        data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
        data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)

        data = pd.concat([data_1, data_2])

        # undersampling
        # if undersampling:
        #     temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
        #     temp = X_1[X_1.User_id.isin(temp.User_id)]
        #     X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])

        # data.drop_duplicates(inplace=True)
        drop_columns(data)
        data.fillna(0, inplace=True)
        data.to_csv(path, index=False)

    return data


def analysis():
    offline, online = get_preprocess_data()

    # t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
    # t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    # t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
    # t = pd.merge(t, t1, on='Discount_rate')
    # t['consume_rate'] = t.consume_count / t.receive_count

    # t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
    # t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    # t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
    # t = pd.merge(t, t1, on='Merchant_id')
    # t['consume_rate'] = t.consume_count / t.receive_count

    t = offline.groupby('Distance').size().reset_index(name='receive_count')
    t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
    t = pd.merge(t, t1, on='Distance')
    t['consume_rate'] = t.consume_count / t.receive_count

    t.to_csv('note.csv')

    # plt.bar(temp.Discount_rate.values, temp.total.values)
    # plt.bar(range(num), y1, bottom=y2, fc='r')
    # plt.show()

    exit()


def detect_duplicate_columns():
    X = get_train_data()
    X = X[:1000]

    for index1 in range(len(X.columns) - 1):
        for index2 in range(index1 + 1, len(X.columns)):
            column1 = X.columns[index1]
            column2 = X.columns[index2]
            X[column1] = X[column1].astype(str)
            X[column2] = X[column2].astype(str)
            temp = len(X[X[column1] == X[column2]])
            if temp == len(X):
                print(column1, column2, temp)
    exit()


def feature_importance_score():
    clf = train_xgb()
    fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
    fscores.plot(kind='bar', title='Feature Importance')
    plt.ylabel('Feature Importance Score')
    plt.show()
    exit()


def feature_selection():
    data = get_train_data()

    train_data, test_data = train_test_split(data,
                                             train_size=100000,
                                             random_state=0
                                             )

    X = train_data.copy().drop(columns='Coupon_id')
    y = X.pop('label')

    # sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    # X = sel.fit_transform(X)
    # print(X.shape)
    # Create the RFE object and rank each pixel


def fit_eval_metric(estimator, X, y, name=None):
    if name is None:
        name = estimator.__class__.__name__

    if name in ('XGBClassifier', 'LGBMClassifier'):
        estimator.fit(X, y, eval_metric='auc')
    else:
        estimator.fit(X, y)

    return estimator


def grid_search(estimator, param_grid):
    start = datetime.datetime.now()

    print('--------------------------------------------')
    print(start.strftime('%Y-%m-%d %H:%M:%S'))
    print(param_grid)
    print()

    data = get_train_data()

    data, _ = train_test_split(data, train_size=100000, random_state=0)

    X = data.copy().drop(columns='Coupon_id')
    y = X.pop('label')

    estimator_name = estimator.__class__.__name__
    n_jobs = cpu_jobs
    clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
                       # cv=5
                       )

    clf = fit_eval_metric(clf, X, y, estimator_name)

    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
    print()
    print('best params', clf.best_params_)
    print('best score', clf.best_score_)
    print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
    print()

    return clf.best_params_, clf.best_score_


def grid_search_auto(steps, params, estimator):
    global log
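    # Coordinate-descent style tuning: step each hyperparameter up/down around its current value,
    # keep moving in the improving direction while the CV AUC rises, then move on to the next
    # parameter; repeat full passes until no parameter changes.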

    old_params = params.copy()

    while 1:
        for name, step in steps.items():
            score = 0

            start = params[name] - step['step']
            if start <= step['min']:
                start = step['min']

            stop = params[name] + step['step']
            if step['max'] != 'inf' and stop >= step['max']:
                stop = step['max']

            while 1:

                if str(step['step']).count('.') == 1:
                    stop += step['step'] / 10
                else:
                    stop += step['step']

                param_grid = {
                    name: np.arange(start, stop, step['step']),
                }

                best_params, best_score = grid_search(estimator.set_params(**params), param_grid)

                if best_params[name] == params[name] or score > best_score:
                    print(estimator.__class__.__name__, params)
                    break

                direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
                start = stop = best_params[name] + step['step'] * direction

                score = best_score
                params[name] = best_params[name]
                print(estimator.__class__.__name__, params)

                if best_params[name] - step['step'] < step['min'] or (
                        step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
                    break

        if old_params == params:
            break
        old_params = params
        print('--------------------------------------------')
        print('new grid search')

    print('--------------------------------------------')
    log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)


def grid_search_gbdt(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 1900,
        'max_depth': 9,
        'min_samples_split': 200,
        'min_samples_leaf': 50,
        'subsample': .8,

        # 'learning_rate': 1e-1,
        # 'n_estimators': 200,
        # 'max_depth': 8,
        # 'min_samples_split': 200,
        # 'min_samples_leaf': 50,
        # 'subsample': .8,

    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
    }

    grid_search_auto(steps, params, GradientBoostingClassifier())


def grid_search_xgb(get_param=False):
    params = {
        'learning_rate': 1e-2,
        'n_estimators': 1260,
        'max_depth': 8,
        'min_child_weight': 4,
        'gamma': .2,
        'subsample': .6,
        'colsample_bytree': .8,
        'scale_pos_weight': 1,
        'reg_alpha': 0,

    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
        'gamma': {'step': .1, 'min': 0, 'max': 1},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
        'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
        'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
        'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
    }

    grid_search_auto(steps, params, XGBClassifier())


def grid_search_lgb(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 1200,
        'num_leaves': 51,
        'min_split_gain': 0,
        'min_child_weight': 1e-3,
        'min_child_samples': 22,
        'subsample': .8,
        'colsample_bytree': .8,

        # 'learning_rate': .1,
        # 'n_estimators': 90,
        # 'num_leaves': 50,
        # 'min_split_gain': 0,
        # 'min_child_weight': 1e-3,
        # 'min_child_samples': 21,
        # 'subsample': .8,
        # 'colsample_bytree': .8,

    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
        'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
        'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
        'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
        # 'subsample': {'step': .1, 'min': .1, 'max': 1},
        'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
    }

    grid_search_auto(steps, params, LGBMClassifier())


def grid_search_cat(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 3600,
        'max_depth': 8,
        'max_bin': 127,
        'reg_lambda': 2,
        'subsample': .7,

        # 'learning_rate': 1e-1,
        # 'iterations': 460,
        # 'depth': 8,
        # 'l2_leaf_reg': 8,
        # 'border_count': 37,

        # 'ctr_border_count': 16,
        'one_hot_max_size': 2,
        'bootstrap_type': 'Bernoulli',
        'leaf_estimation_method': 'Newton',
        'verbose': False,
        'eval_metric': 'AUC',
        'thread_count': cpu_jobs
    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'max_bin': {'step': 1, 'min': 1, 'max': 255},
        'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
        'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
    }

    grid_search_auto(steps, params, CatBoostClassifier())


def grid_search_rf(criterion='gini', get_param=False):
    if criterion == 'gini':
        params = {
            # 10
            'n_estimators': 3090,
            'max_depth': 15,
            'min_samples_split': 2,
            'min_samples_leaf': 1,

            'criterion': 'gini',
        }
    else:
        params = {
            'n_estimators': 3110,
            'max_depth': 13,
            'min_samples_split': 70,
            'min_samples_leaf': 10,
            'criterion': 'entropy',
        }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
    }

    grid_search_auto(steps, params, RandomForestClassifier())


def grid_search_et(criterion='gini', get_param=False):
    if criterion == 'gini':
        params = {
            # 10
            'n_estimators': 3060,
            'max_depth': 22,
            'min_samples_split': 12,
            'min_samples_leaf': 1,

            'criterion': 'gini',
        }
    else:
        params = {
            'n_estimators': 3100,
            'max_depth': 13,
            'min_samples_split': 70,
            'min_samples_leaf': 10,
            'criterion': 'entropy',
        }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
    }

    grid_search_auto(steps, params, ExtraTreesClassifier())


def train_gbdt(model=False):
    global log

    params = grid_search_gbdt(True)
    clf = GradientBoostingClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'gbdt'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', subsample: %.1f' % params['subsample']
    log += '\n\n'

    return train(clf)


def train_xgb(model=False):
    global log

    params = grid_search_xgb(True)

    clf = XGBClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)


def train_lgb(model=False):
    global log

    params = grid_search_lgb(True)

    clf = LGBMClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'lgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', num_leaves: %d' % params['num_leaves']
    log += ', min_split_gain: %.1f' % params['min_split_gain']
    log += ', min_child_weight: %.4f' % params['min_child_weight']
    log += ', min_child_samples: %d' % params['min_child_samples']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)


def train_cat(model=False):
    global log

    params = grid_search_cat(True)

    clf = CatBoostClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'cat'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', iterations: %d' % params['iterations']
    log += ', depth: %d' % params['depth']
    log += ', l2_leaf_reg: %d' % params['l2_leaf_reg']
    log += ', border_count: %d' % params['border_count']
    log += ', subsample: %d' % params['subsample']
    log += ', one_hot_max_size: %d' % params['one_hot_max_size']
    log += '\n\n'

    return train(clf)


def train_rf(clf):
    global log

    params = clf.get_params()
    log += 'rf'
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', criterion: %s' % params['criterion']
    log += '\n\n'

    return train(clf)


def train_rf_gini(model=False):
    clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
    if model:
        return clf
    return train_rf(clf)


def train_rf_entropy():
    clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))

    return train_rf(clf)


def train_et(clf):
    global log

    params = clf.get_params()
    log += 'et'
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', criterion: %s' % params['criterion']
    log += '\n\n'

    return train(clf)


def train_et_gini(model=False):
    clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
    if model:
        return clf
    return train_et(clf)


def train_et_entropy():
    clf = ExtraTreesClassifier().set_params(**{
        'n_estimators': 310,
        'max_depth': 13,
        'min_samples_split': 70,
        'min_samples_leaf': 10,
        'criterion': 'entropy',
    })

    return train_et(clf)


def train(clf):
    global log

    data = get_train_data()

    train_data, test_data = train_test_split(data,
                                             train_size=100000,
                                             random_state=0
                                             )

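    # The second split overwrites test_data with sklearn's default 75/25 split, so evaluation uses
    # the same holdout slice (random_state=0) as blending's X_submission.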
    _, test_data = train_test_split(data, random_state=0)

    X_train = train_data.copy().drop(columns='Coupon_id')
    y_train = X_train.pop('label')

    clf = fit_eval_metric(clf, X_train, y_train)

    X_test = test_data.copy().drop(columns='Coupon_id')
    y_test = X_test.pop('label')

    y_true, y_pred = y_test, clf.predict(X_test)
    # log += '%s\n' % classification_report(y_test, y_pred)
    log += '  accuracy: %f\n' % accuracy_score(y_true, y_pred)
    y_score = clf.predict_proba(X_test)[:, 1]
    log += '       auc: %f\n' % roc_auc_score(y_true, y_score)

    # coupon average auc
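    # (the competition ranks by AUC averaged per Coupon_id, so this approximates the leaderboard score)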
    coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
    aucs = []
    for _, coupon in coupons.iterrows():
        if coupon.total > 1:
            X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
            X_test.drop(columns='Coupon_id', inplace=True)

            if len(X_test.label.unique()) != 2:
                continue

            y_true = X_test.pop('label')
            y_score = clf.predict_proba(X_test)[:, 1]
            aucs.append(roc_auc_score(y_true, y_score))

    log += 'coupon auc: %f\n\n' % np.mean(aucs)

    return clf


def predict(model):
    path = 'cache_%s_predict.csv' % os.path.basename(__file__)

    if os.path.exists(path):
        X = pd.read_csv(path, parse_dates=['Date_received'])
    else:
        offline, online = get_preprocess_data()

        # 2016-03-16 ~ 2016-06-30
        start = '2016-03-16'
        offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
        online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]

        X = get_preprocess_data(True)
        X = get_offline_features(X, offline)
        X = get_online_features(online, X)
        X.drop_duplicates(inplace=True)
        X.fillna(0, inplace=True)
        X.to_csv(path, index=False)

    sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
    sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
    drop_columns(X, True)

    if model == 'blending':
        predict = blending(X)
    else:
        clf = eval('train_%s' % model)()
        predict = clf.predict_proba(X)[:, 1]

    sample_submission['Probability'] = predict
    sample_submission.to_csv('submission_%s.csv' % model,
                             #  float_format='%.5f',
                             index=False, header=False)


def blending(predict_X=None):
    global log
    log += '\n'

    X = get_train_data().drop(columns='Coupon_id')
    y = X.pop('label')

    X = np.asarray(X)
    y = np.asarray(y)

    _, X_submission, _, y_test_blend = train_test_split(X, y,
                                                        random_state=0
                                                        )

    if predict_X is not None:
        X_submission = np.asarray(predict_X)

    X, _, y, _ = train_test_split(X, y,
                                  train_size=100000,
                                  random_state=0
                                  )

    # np.random.seed(0)
    # idx = np.random.permutation(y.size)
    # X = X[idx]
    # y = y[idx]

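    # Stacking: each base model's out-of-fold predictions become a meta-feature column in
    # blend_X_train; its per-fold predictions on X_submission are averaged into blend_X_test,
    # and a logistic regression is fit on top.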
    skf = StratifiedKFold()
    # clfs = ['gbdt', 'xgb', 'lgb', 'cat',
    #         # 'rf_gini', 'et_gini'
    #         ]
    clfs = ['gbdt', 'cat', 'lgb']

    blend_X_train = np.zeros((X.shape[0], len(clfs)))
    blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, v in enumerate(clfs):
        clf = eval('train_%s' % v)(True)

        aucs = []
        dataset_blend_test_j = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = fit_eval_metric(clf, X_train, y_train)

            y_submission = clf.predict_proba(X_test)[:, 1]
            aucs.append(roc_auc_score(y_test, y_submission))

            blend_X_train[test_index, j] = y_submission
            dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])

        log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
        blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)

    print('blending')
    clf = LogisticRegression()
    # clf = GradientBoostingClassifier()
    clf.fit(blend_X_train, y)
    y_submission = clf.predict_proba(blend_X_test)[:, 1]

    # Linear stretch of predictions to [0,1]
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    if predict_X is not None:
        return y_submission
    log += '\n  blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)


if __name__ == '__main__':
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))
    log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
    cpu_jobs = os.cpu_count() - 1
    date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')

    predict('blending')
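    # other entry points defined above: analysis(), feature_importance_score(),
    # grid_search_xgb(), predict('xgb'), ...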

    log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
    log += '----------------------------------------------------\n'
    open('%s.log' % os.path.basename(__file__), 'a').write(log)
    print(log)

Model 2: score 0.79

# work around a LightGBM error caused by a duplicate OpenMP runtime
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import datetime
import os
from concurrent.futures import ProcessPoolExecutor
from math import ceil

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


def drop_columns(X, predict=False):
    columns = [
        'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
        # 'u33', 'u34'
    ]

    if predict:
        columns.append('Coupon_id')
    else:
        columns.append('Date')

    X.drop(columns=columns, inplace=True)


def get_preprocess_data(predict=False):
    if predict:
        offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
    else:
        offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])

    offline.Distance.fillna(11, inplace=True)
    offline.Distance = offline.Distance.astype(int)
    offline.Coupon_id.fillna(0, inplace=True)
    offline.Coupon_id = offline.Coupon_id.astype(int)
    offline.Date_received.fillna(date_null, inplace=True)

    offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
        'Discount_rate'].str.split(':', expand=True).astype(int)
    offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
    offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)

    if predict:
        return offline

    offline.Date.fillna(date_null, inplace=True)

    # online
    online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])

    online.Coupon_id.fillna(0, inplace=True)
    # online.Coupon_id = online.Coupon_id.astype(int)
    online.Date_received.fillna(date_null, inplace=True)
    online.Date.fillna(date_null, inplace=True)

    return offline, online


def task(X_chunk, X, counter):
    print(counter, end=',', flush=True)
    X_chunk = X_chunk.copy()

    X_chunk['o17'] = -1
    X_chunk['o18'] = -1

    for i, user in X_chunk.iterrows():
        temp = X[X.User_id == user.User_id]

        temp1 = temp[temp.Date_received < user.Date_received]
        temp2 = temp[temp.Date_received > user.Date_received]

        # 使用者此次之後/前領取的所有優惠券數目
        X_chunk.loc[i, 'o3'] = len(temp1)
        X_chunk.loc[i, 'o4'] = len(temp2)

        # 使用者此次之後/前領取的特定優惠券數目
        X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
        X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])

        # 使用者上/下一次領取的時間間隔
        temp1 = temp1.sort_values(by='Date_received', ascending=False)
        if len(temp1):
            X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days

        temp2 = temp2.sort_values(by='Date_received')
        if len(temp2):
            X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days

    return X_chunk


def get_offline_features(X, offline):
    # X = X[:1000]

    print(len(X), len(X.columns))

    temp = offline[offline.Coupon_id != 0]
    coupon_consume = temp[temp.Date != date_null]
    coupon_no_consume = temp[temp.Date == date_null]

    user_coupon_consume = coupon_consume.groupby('User_id')

    X['weekday'] = X.Date_received.dt.weekday
    X['day'] = X.Date_received.dt.day

    # # 距離優惠券消費次數
    # temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
    # X = pd.merge(X, temp, how='left', on='Distance')
    #
    # # 距離優惠券不消費次數
    # temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
    # X = pd.merge(X, temp, how='left', on='Distance')
    #
    # # 距離優惠券領取次數
    # X['distance_2'] = X.distance_0 + X.distance_1
    #
    # # 距離優惠券消費率
    # X['distance_3'] = X.distance_0 / X.distance_2

    # temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
    # temp['d4'] = temp.Distance.sum() / len(temp)
    # X = pd.merge(X, temp, how='left', on='Distance')

    '''user features'''

    # 優惠券消費次數
    temp = user_coupon_consume.size().reset_index(name='u2')
    X = pd.merge(X, temp, how='left', on='User_id')
    # X.u2.fillna(0, inplace=True)
    # X.u2 = X.u2.astype(int)

    # 優惠券不消費次數
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用優惠券次數與沒使用優惠券次數比值
    X['u19'] = X.u2 / X.u3

    # 領取優惠券次數
    X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)

    # 優惠券核銷率
    X['u4'] = X.u2 / X.u1

    # 普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp1 = temp.groupby('User_id').size().reset_index(name='u5')
    X = pd.merge(X, temp1, how='left', on='User_id')

    # 一共消費多少次
    X['u25'] = X.u2 + X.u5

    # 使用者使用優惠券消費佔比
    X['u20'] = X.u2 / X.u25

    # 正常消費平均間隔
    temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')

    # 優惠券消費平均間隔
    temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')

    # 15天內平均會普通消費幾次
    X['u8'] = X.u6 / 15

    # 15天內平均會優惠券消費幾次
    X['u9'] = X.u7 / 15

    # 領取優惠券到使用優惠券的平均間隔時間
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 在15天內使用掉優惠券的值大小
    X['u11'] = X.u10 / 15

    # 領取優惠券到使用優惠券間隔小於15天的次數
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = temp[temp.days <= 15]
    temp = temp.groupby('User_id').size().reset_index(name='u21')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者15天使用掉優惠券的次數除以使用優惠券的次數
    X['u22'] = X.u21 / X.u2

    # 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
    X['u23'] = X.u21 / X.u3

    # 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
    X['u24'] = X.u21 / X.u1

    # 消費優惠券的平均折率
    temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券的最低消費折率
    temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券的最高消費折率
    temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷過的不同優惠券數量
    temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='u32')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者領取所有不同優惠券數量
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='u47')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
    X['u33'] = X.u32 / X.u47

    # 使用者平均每種優惠券核銷多少張
    X['u34'] = X.u2 / X.u47

    # 核銷優惠券使用者-商家平均距離
    temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
    temp = temp.groupby('User_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
    temp['u35'] = temp.y / temp.x
    temp = temp[['User_id', 'u35']]
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券中的最小使用者-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券中的最大使用者-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 優惠券型別
    discount_types = [
        '0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
        '20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
        '50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
        '200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
        '300:10', '200:5', '50:1', '100:1',
    ]
    X['discount_type'] = -1
    for k, v in enumerate(discount_types):
        X.loc[X.Discount_rate == v, 'discount_type'] = k

    # 不同優惠券領取次數
    temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同優惠券使用次數
    temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同優惠券不使用次數
    temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])

    # 不同打折優惠券使用率
    X['u44'] = X.u42 / X.u41

    # 滿減型別優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains(':') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u48')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 打折型別優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u49')
    X = pd.merge(X, temp, how='left', on='User_id')

    '''offline merchant features'''

    # 商戶消費次數
    temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券被領取後核銷次數
    temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商戶正常消費筆數
    X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)

    # 商家優惠券被領取次數
    temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券被領取後核銷率
    X['m4'] = X.m1 / X.m3

    # 商家優惠券被領取後不核銷次數
    temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商戶當天優惠券領取次數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # 商戶當天優惠券領取人數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])

    # 商家優惠券核銷的平均消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷的最小消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m9')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷的最大消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m10')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券核銷不同的使用者數量
    temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家優惠券領取不同的使用者數量
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
    X['m13'] = X.m11 / X.m12

    # 商家優惠券平均每個使用者核銷多少張
    X['m14'] = X.m1 / X.m12

    # 商家被核銷過的不同優惠券數量
    temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家領取過的不同優惠券數量的比重
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
    X['m19'] = X.m15 / X.m18

    # 商家被核銷優惠券的平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
    temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('Merchant_id')
    X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家平均距離
    temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
    temp['m21'] = temp.y / temp.x
    temp = temp[['Merchant_id', 'm21']]
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家最小距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被核銷優惠券中的使用者-商家最大距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    """offline coupon features"""

    # 此優惠券一共發行多少張
    temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # 此優惠券一共被使用多少張
    temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
    X = pd.merge(X, temp, how='left', on='Coupon_id')

    # 優惠券使用率
    X['c3'] = X.c2 / X.c1

    # 沒有使用的數目
    X['c4'] = X.c1 - X.c2

    # 此優惠券在當天發行了多少張
    temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
    X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])

    # 優惠券型別(直接優惠為0, 滿減為1)
    X['c6'] = 0
    X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1

    # 不同打折優惠券領取次數
    temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券使用次數
    temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券不使用次數
    temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
    X = pd.merge(X, temp, how='left', on='Discount_rate')

    # 不同打折優惠券使用率
    X['c11'] = X.c9 / X.c8

    # 優惠券核銷平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
    temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
    temp = temp.drop_duplicates('Coupon_id')
    X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')

    '''user merchant feature'''

    # 使用者領取商家的優惠券次數
    temp = offline[offline.Coupon_id != 0]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後不核銷次數
    temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後核銷次數
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取商家的優惠券後核銷率
    X['um4'] = X.um3 / X.um1

    # 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
    X = pd.merge(X, temp, how='left', on='User_id')
    X['um5'] = X.um2 / X.temp
    X.drop(columns='temp', inplace=True)

    # 使用者在商店總共消費過幾次
    temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者在商店普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者當天在此商店領取的優惠券數目
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])

    # 使用者領取優惠券不同商家數量
    temp = offline[offline.Coupon_id != 0]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
    temp = temp.groupby('User_id').size().reset_index(name='um9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷優惠券不同商家數量
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='um10')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
    X['um11'] = X.um10 / X.um9

    # 使用者平均核銷每個商家多少張優惠券
    X['um12'] = X.u2 / X.um9

    '''other feature'''

    # 使用者領取的所有優惠券數目
    temp = X.groupby('User_id').size().reset_index(name='o1')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
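
    # The remaining per-row features are produced by task(), which iterates over
    # every row and is slow, so the frame is split into cpu_jobs chunks and each
    # chunk is handled by its own worker process below.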

    # multiprocessing
    # data split
    stop = len(X)
    step = int(ceil(stop / cpu_jobs))

    X_chunks = [X[i:i + step] for i in range(0, stop, step)]
    X_list = [X] * cpu_jobs
    counters = [i for i in range(cpu_jobs)]

    start = datetime.datetime.now()
    with ProcessPoolExecutor() as e:
        X = pd.concat(e.map(task, X_chunks, X_list, counters))
        print('time:', str(datetime.datetime.now() - start).split('.')[0])
    # multiprocessing

    # 使用者領取優惠券平均時間間隔
    temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')

    # 使用者領取特定商家的優惠券數目
    temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])

    # 使用者領取的不同商家數目
    temp = X.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者當天領取的優惠券數目
    temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])

    # 使用者當天領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])

    # 使用者領取的所有優惠券種類數目
    temp = X.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o12')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 商家被領取的優惠券數目
    temp = X.groupby('Merchant_id').size().reset_index(name='o13')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家被領取的特定優惠券數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])

    # 商家被多少不同使用者領取的數目
    temp = X.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    # 商家發行的所有優惠券種類數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    print(len(X), len(X.columns))

    return X


def get_online_features(online, X):
    # temp = online[online.Coupon_id == online.Coupon_id]
    # coupon_consume = temp[temp.Date == temp.Date]
    # coupon_no_consume = temp[temp.Date != temp.Date]

    # 使用者線上操作次數
    temp = online.groupby('User_id').size().reset_index(name='on_u1')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上點選次數
    temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上點選率
    X['on_u3'] = X.on_u2 / X.on_u1

    # 使用者線上購買次數
    temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上購買率
    X['on_u5'] = X.on_u4 / X.on_u1

    # 使用者線上領取次數
    temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上領取率
    X['on_u7'] = X.on_u6 / X.on_u1

    # 使用者線上不消費次數
    temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u8')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上優惠券核銷次數
    temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u9')
    X = pd.merge(X, temp, how='left', on='User_id')

    # 使用者線上優惠券核銷率
    X['on_u10'] = X.on_u9 / X.on_u6

    # 使用者線下不消費次數佔線上線下總的不消費次數的比重
    X['on_u11'] = X.u3 / (X.on_u8 + X.u3)

    # 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
    X['on_u12'] = X.u2 / (X.on_u9 + X.u2)

    # 使用者線下領取的記錄數量佔總的記錄數量的比重
    X['on_u13'] = X.u1 / (X.on_u6 + X.u1)

    # # 消費優惠券的平均折率
    # temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 使用者核銷優惠券的最低消費折率
    # temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 使用者核銷優惠券的最高消費折率
    # temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
    # X = pd.merge(X, temp, how='left', on='User_id')
    #
    # # 不同打折優惠券領取次數
    # temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券使用次數
    # temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券不使用次數
    # temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
    # X = pd.merge(X, temp, how='left', on='Discount_rate')
    #
    # # 不同打折優惠券使用率
    # X['oc4'] = X.oc2 / X.oc1

    print(len(X), len(X.columns))
    print('----------')

    return X


def get_train_data():
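    """Build and cache the training set.

    Two label windows are used: coupons received 2016-04-16 ~ 2016-05-15 and
    2016-05-16 onwards (receipts end on 2016-06-15). A row is labelled 1 when
    the coupon is consumed within 15 days of receipt. The offline/online
    feature windows for each part end before the corresponding label window
    starts, so features are built only from earlier behaviour.
    """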
    path = 'cache_%s_train.csv' % os.path.basename(__file__)

    if os.path.exists(path):
        data = pd.read_csv(path)
    else:
        offline, online = get_preprocess_data()

        # date received 2016-01-01 - 2016-06-15
        # date consumed 2016-01-01 - 2016-06-30

        # train data 1
        # 2016-04-16 ~ 2016-05-15
        data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
        data_1['label'] = 0
        data_1.loc[
            (data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1

        # feature data 1
        # 領券 2016-01-01 ~ 2016-03-31
        end = '2016-03-31'
        data_off_1 = offline[offline.Date_received <= end]
        data_on_1 = online[online.Date_received <= end]

        # 普通消費 2016-01-01 ~ 2016-04-15
        end = '2016-04-15'
        data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
        data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]

        data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
        data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)

        # train data 2
        # 2016-05-16 ~ 2016-06-15
        data_2 = offline['2016-05-16' <= offline.Date_received].copy()
        data_2['label'] = 0
        data_2.loc[
            (data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1

        # feature data 2
        # 領券
        start = '2016-02-01'
        end = '2016-04-30'
        data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
        data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]

        # 普通消費
        start = '2016-02-01'
        end = '2016-05-15'
        data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
        data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]

        data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
        data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)

        data = pd.concat([data_1, data_2])

        # undersampling
        # if undersampling:
        #     temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
        #     temp = X_1[X_1.User_id.isin(temp.User_id)]
        #     X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])

        # data.drop_duplicates(inplace=True)
        drop_columns(data)
        data.fillna(0, inplace=True)
        data.to_csv(path, index=False)

    return data


def analysis():
    offline, online = get_preprocess_data()

    # t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
    # t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    # t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
    # t = pd.merge(t, t1, on='Discount_rate')
    # t['consume_rate'] = t.consume_count / t.receive_count

    # t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
    # t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    # t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
    # t = pd.merge(t, t1, on='Merchant_id')
    # t['consume_rate'] = t.consume_count / t.receive_count

    t = offline.groupby('Distance').size().reset_index(name='receive_count')
    t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
    t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
    t = pd.merge(t, t1, on='Distance')
    t['consume_rate'] = t.consume_count / t.receive_count

    t.to_csv('note.csv')

    # plt.bar(temp.Discount_rate.values, temp.total.values)
    # plt.bar(range(num), y1, bottom=y2, fc='r')
    # plt.show()

    exit()


def detect_duplicate_columns():
    X = get_train_data()
    X = X[:1000]

    for index1 in range(len(X.columns) - 1):
        for index2 in range(index1 + 1, len(X.columns)):
            column1 = X.columns[index1]
            column2 = X.columns[index2]
            X[column1] = X[column1].astype(str)
            X[column2] = X[column2].astype(str)
            temp = len(X[X[column1] == X[column2]])
            if temp == len(X):
                print(column1, column2, temp)
    exit()


def feature_importance_score():
    clf = train_xgb()
    fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
    fscores.plot(kind='bar', title='Feature Importance')
    plt.ylabel('Feature Importance Score')
    plt.show()
    exit()


def feature_selection():
    data = get_train_data()

    train_data, test_data = train_test_split(data,
                                             train_size=100000,
                                             random_state=0
                                             )

    X = train_data.copy().drop(columns='Coupon_id')
    y = X.pop('label')

    # sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    # X = sel.fit_transform(X)
    # print(X.shape)
    # Create the RFE object and rank each pixel


def fit_eval_metric(estimator, X, y, name=None):
    if name is None:
        name = estimator.__class__.__name__

    if name in ('XGBClassifier', 'LGBMClassifier'):
        estimator.fit(X, y, eval_metric='auc')
    else:
        estimator.fit(X, y)

    return estimator


def grid_search(estimator, param_grid):
    start = datetime.datetime.now()

    print('--------------------------------------------')
    print(start.strftime('%Y-%m-%d %H:%M:%S'))
    print(param_grid)
    print()

    data = get_train_data()

    data, _ = train_test_split(data, train_size=100000, random_state=0)

    X = data.copy().drop(columns='Coupon_id')
    y = X.pop('label')

    estimator_name = estimator.__class__.__name__
    n_jobs = cpu_jobs
    if estimator_name in ('XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'):
        n_jobs = 1

    clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
                       # cv=5
                       )

    clf = fit_eval_metric(clf, X, y, estimator_name)

    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
    print()
    print('best params', clf.best_params_)
    print('best score', clf.best_score_)
    print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
    print()

    return clf.best_params_, clf.best_score_


def grid_search_auto(steps, params, estimator):
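    """Greedy one-parameter-at-a-time tuning.

    For each entry in `steps`, grid_search() is run over a small range around
    the current value of that parameter; while the best value keeps moving and
    the score keeps improving, the search window is shifted one step further in
    that direction. The outer loop repeats over all parameters until a full
    pass leaves `params` unchanged.
    """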
    global log

    old_params = params.copy()

    while 1:
        for name, step in steps.items():
            score = 0

            start = params[name] - step['step']
            if start <= step['min']:
                start = step['min']

            stop = params[name] + step['step']
            if step['max'] != 'inf' and stop >= step['max']:
                stop = step['max']

            while 1:

                if str(step['step']).count('.') == 1:
                    stop += step['step'] / 10
                else:
                    stop += step['step']

                param_grid = {
                    name: np.arange(start, stop, step['step']),
                }

                best_params, best_score = grid_search(estimator.set_params(**params), param_grid)

                if best_params[name] == params[name] or score > best_score:
                    print(estimator.__class__.__name__, params)
                    break

                direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
                start = stop = best_params[name] + step['step'] * direction

                score = best_score
                params[name] = best_params[name]
                print(estimator.__class__.__name__, params)

                if best_params[name] - step['step'] < step['min'] or (
                        step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
                    break

        if old_params == params:
            break
        old_params = params
        print('--------------------------------------------')
        print('new grid search')

    print('--------------------------------------------')
    log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)


def grid_search_gbdt(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 1900,
        'max_depth': 9,
        'min_samples_split': 200,
        'min_samples_leaf': 50,
        'subsample': .8,
    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
    }

    grid_search_auto(steps, params, GradientBoostingClassifier())


def grid_search_xgb(get_param=False):
    params = {
        # 8名的引數
        'booster': 'gbtree',
        'objective': 'rank:pairwise',
        'min_child_weight': 1.1,
        'colsample_bylevel': .7,
        'reg_lambda': 1,

        'learning_rate': 1e-2,
        'n_estimators': 3500,
        'max_depth': 5,
        'gamma': .1,
        'subsample': .7,
        'colsample_bytree': .7,
        'scale_pos_weight': 1,
        'reg_alpha': 0,
        'nthread': 12,
    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
        'gamma': {'step': .1, 'min': 0, 'max': 1},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
        'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
        'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
        'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
    }

    grid_search_auto(steps, params, XGBClassifier())


def grid_search_lgb(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 1200,
        'num_leaves': 51,
        'min_split_gain': 0,
        'min_child_weight': 1e-3,
        'min_child_samples': 22,
        'subsample': .8,
        'colsample_bytree': .8,
    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
        'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
        'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
        'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
        # 'subsample': {'step': .1, 'min': .1, 'max': 1},
        'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
    }

    grid_search_auto(steps, params, LGBMClassifier())


def grid_search_cat(get_param=False):
    params = {
        # 10
        'learning_rate': 1e-2,
        'n_estimators': 3600,
        'max_depth': 8,
        'max_bin': 127,
        'reg_lambda': 2,
        'subsample': .7,

        'one_hot_max_size': 2,
        'bootstrap_type': 'Bernoulli',
        'leaf_estimation_method': 'Newton',
        'verbose': False,
        'eval_metric': 'AUC',
        'thread_count': cpu_jobs
    }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 150, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'max_bin': {'step': 1, 'min': 1, 'max': 255},
        'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
        'subsample': {'step': .1, 'min': .1, 'max': 1},
        'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
    }

    grid_search_auto(steps, params, CatBoostClassifier())


def grid_search_rf(criterion='gini', get_param=False):
    if criterion == 'gini':
        params = {
            # 10
            'n_estimators': 3090,
            'max_depth': 15,
            'min_samples_split': 2,
            'min_samples_leaf': 1,

            'criterion': 'gini',
        }
    else:
        params = {
            'n_estimators': 3110,
            'max_depth': 13,
            'min_samples_split': 70,
            'min_samples_leaf': 10,
            'criterion': 'entropy',
        }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
    }

    grid_search_auto(steps, params, RandomForestClassifier())


def grid_search_et(criterion='gini', get_param=False):
    if criterion == 'gini':
        params = {
            # 10
            'n_estimators': 3060,
            'max_depth': 22,
            'min_samples_split': 12,
            'min_samples_leaf': 1,

            'criterion': 'gini',
        }
    else:
        params = {
            'n_estimators': 3100,
            'max_depth': 13,
            'min_samples_split': 70,
            'min_samples_leaf': 10,
            'criterion': 'entropy',
        }

    if get_param:
        return params

    steps = {
        'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
        'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
        'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
        'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
    }

    grid_search_auto(steps, params, ExtraTreesClassifier())


def train_gbdt(model=False):
    global log

    params = grid_search_gbdt(True)
    clf = GradientBoostingClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'gbdt'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', subsample: %.1f' % params['subsample']
    log += '\n\n'

    return train(clf)


def train_xgb(model=False):
    global log

    params = grid_search_xgb(True)

    clf = XGBClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)


def train_lgb(model=False):
    global log

    params = grid_search_lgb(True)

    clf = LGBMClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'lgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', num_leaves: %d' % params['num_leaves']
    log += ', min_split_gain: %.1f' % params['min_split_gain']
    log += ', min_child_weight: %.4f' % params['min_child_weight']
    log += ', min_child_samples: %d' % params['min_child_samples']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)


def train_cat(model=False):
    global log

    params = grid_search_cat(True)

    clf = CatBoostClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'cat'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', reg_lambda: %d' % params['reg_lambda']
    log += ', max_bin: %d' % params['max_bin']
    log += ', subsample: %.1f' % params['subsample']
    log += ', one_hot_max_size: %d' % params['one_hot_max_size']
    log += '\n\n'

    return train(clf)


def train_rf(clf):
    global log

    params = clf.get_params()
    log += 'rf'
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', criterion: %s' % params['criterion']
    log += '\n\n'

    return train(clf)


def train_rf_gini(model=False):
    clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
    if model:
        return clf
    return train_rf(clf)


def train_rf_entropy():
    clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))

    return train_rf(clf)


def train_et(clf):
    global log

    params = clf.get_params()
    log += 'et'
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', criterion: %s' % params['criterion']
    log += '\n\n'

    return train(clf)


def train_et_gini(model=False):
    clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
    if model:
        return clf
    return train_et(clf)


def train_et_entropy():
    clf = ExtraTreesClassifier().set_params(**{
        'n_estimators': 3100,
        'max_depth': 13,
        'min_samples_split': 70,
        'min_samples_leaf': 10,
        'criterion': 'entropy',
        'random_state': 0
    })

    return train_et(clf)


def train(clf):
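    """Fit clf on 100,000 training rows and log three numbers on the hold-out
    split: accuracy, overall AUC, and the per-coupon average AUC (AUC computed
    for every Coupon_id with more than one sample and both classes present,
    then averaged), which mirrors the competition's evaluation metric. The
    hold-out set comes from the default train_test_split(random_state=0),
    the same split used in blending().
    """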
    global log

    data = get_train_data()

    train_data, test_data = train_test_split(data,
                                             train_size=100000,
                                             random_state=0
                                             )

    _, test_data = train_test_split(data, random_state=0)

    X_train = train_data.copy().drop(columns='Coupon_id')
    y_train = X_train.pop('label')

    clf = fit_eval_metric(clf, X_train, y_train)

    X_test = test_data.copy().drop(columns='Coupon_id')
    y_test = X_test.pop('label')

    y_true, y_pred = y_test, clf.predict(X_test)
    # log += '%s\n' % classification_report(y_test, y_pred)
    log += '  accuracy: %f\n' % accuracy_score(y_true, y_pred)
    y_score = clf.predict_proba(X_test)[:, 1]
    log += '       auc: %f\n' % roc_auc_score(y_true, y_score)

    # coupon average auc
    coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
    aucs = []
    for _, coupon in coupons.iterrows():
        if coupon.total > 1:
            X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
            X_test.drop(columns='Coupon_id', inplace=True)

            if len(X_test.label.unique()) != 2:
                continue

            y_true = X_test.pop('label')
            y_score = clf.predict_proba(X_test)[:, 1]
            aucs.append(roc_auc_score(y_true, y_score))

    log += 'coupon auc: %f\n\n' % np.mean(aucs)

    return clf


def predict(model):
    path = 'cache_%s_predict.csv' % os.path.basename(__file__)

    if os.path.exists(path):
        X = pd.read_csv(path, parse_dates=['Date_received'])
    else:
        offline, online = get_preprocess_data()

        # 2016-03-16 ~ 2016-06-30
        start = '2016-03-16'
        offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
        online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]

        X = get_preprocess_data(True)
        X = get_offline_features(X, offline)
        X = get_online_features(online, X)
        X.drop_duplicates(inplace=True)
        X.fillna(0, inplace=True)
        X.to_csv(path, index=False)

    sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
    sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
    drop_columns(X, True)

    if model == 'blending':
        predict = blending(X)
    else:
        clf = eval('train_%s' % model)()
        predict = clf.predict_proba(X)[:, 1]

    sample_submission['Probability'] = predict
    sample_submission.to_csv('submission_%s.csv' % model,
                             #  float_format='%.5f',
                             index=False, header=False)


def blending(predict_X=None):
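    """Blend the six base models with a logistic-regression meta-learner.

    For each base model, StratifiedKFold out-of-fold predictions on the 100k
    training rows become one meta-feature column, and the fold-averaged
    predictions on the hold-out / submission rows fill the corresponding test
    column. A LogisticRegression is then fitted on the meta-features and its
    probabilities are min-max scaled to [0, 1]; when predict_X is given, those
    scaled predictions are returned, otherwise the blend AUC is logged.
    """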
    global log
    log += '\n'

    X = get_train_data().drop(columns='Coupon_id')
    y = X.pop('label')

    X = np.asarray(X)
    y = np.asarray(y)

    _, X_submission, _, y_test_blend = train_test_split(X, y,
                                                        random_state=0
                                                        )

    if predict_X is not None:
        X_submission = np.asarray(predict_X)

    X, _, y, _ = train_test_split(X, y,
                                  train_size=100000,
                                  random_state=0
                                  )

    # np.random.seed(0)
    # idx = np.random.permutation(y.size)
    # X = X[idx]
    # y = y[idx]

    skf = StratifiedKFold()
    clfs = ['gbdt', 'xgb',
            'rf_gini', 'et_gini', 'lgb', 'cat'
            ]

    blend_X_train = np.zeros((X.shape[0], len(clfs)))
    blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, v in enumerate(clfs):
        clf = eval('train_%s' % v)(True)

        aucs = []
        dataset_blend_test_j = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = fit_eval_metric(clf, X_train, y_train)

            y_submission = clf.predict_proba(X_test)[:, 1]
            aucs.append(roc_auc_score(y_test, y_submission))

            blend_X_train[test_index, j] = y_submission
            dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])

        log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
        blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)

    print('blending')
    clf = LogisticRegression()
    # clf = GradientBoostingClassifier()
    clf.fit(blend_X_train, y)
    y_submission = clf.predict_proba(blend_X_test)[:, 1]

    # Linear stretch of predictions to [0,1]
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    if predict_X is not None:
        return y_submission
    log += '\n  blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)
    print(log)


if __name__ == '__main__':
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))
    log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
    cpu_jobs = os.cpu_count() - 1
    date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')

    blending()
    predict('blending')

    log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
    log += '----------------------------------------------------\n'
    open('%s.log' % os.path.basename(__file__), 'a').write(log)
    print(log)

Model 3: score 0.5

import os
import pickle
from datetime import date

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

from Motor_fault.model_utils import build_model_etr, build_model_rf

os.chdir(r'E:\專案檔案\o2o優惠券使用預測')

dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('ccf_online_stage1_train.csv')


# 1. 將滿xx減yy型別(`xx:yy`)的券變成折扣率 : `1 - yy/xx`,同時建立折扣券相關的特徵 `discount_rate, discount_man, discount_jian, discount_type`
# 2. 將距離 `str` 轉為 `int`
# convert Discount_rate and Distance
def getDiscountType(row):
    if pd.isnull(row):
        return np.nan
    elif ':' in row:
        return 1
    else:
        return 0


def convertRate(row):
    """Convert discount to rate"""
    if pd.isnull(row):
        return 1.0
    elif ':' in str(row):
        rows = row.split(':')
        return 1.0 - float(rows[1]) / float(rows[0])
    else:
        return float(row)


def getDiscountMan(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0


def getDiscountJian(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0


print("tool is ok.")


def processData(df):
    # convert discount_rate
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    print(df['discount_rate'].unique())
    # convert distance
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df


dfoff = processData(dfoff)
dftest = processData(dftest)

date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])

date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'],
                                                                                          as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(
    ['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']

print("end")


def getWeekday(row):
    if row == 'nan':
        return np.nan
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1


dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type :  週六和週日為1,其他為0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)

# change weekday to one-hot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf


def label(row):
    if pd.isnull(row['Date_received']):
        return -1
    if pd.notnull(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
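
# Example of the 15-day rule: Date_received = 20160101 and Date = 20160110 give
# a gap of 9 days (<= 15), so the label is 1; with Date = 20160120 the gap is
# 19 days and the label stays 0.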


dfoff['label'] = dfoff.apply(label, axis=1)

print("end")

# data split
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")

# feature
original_feature = ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday',
                    'weekday_type'] + weekdaycols
print("----train-----")
x_train, y_train = train[original_feature], train['label']
x_val = dftest[original_feature]
model_rf = build_model_rf(x_train, y_train)
model_etr = build_model_etr(x_train, y_train)

train_etr_pred = model_etr.predict(x_train)
train_rf_pred = model_rf.predict(x_train)

Strak_X_train = pd.DataFrame()
Strak_X_train['Method_2'] = train_rf_pred
Strak_X_train['Method_4'] = train_etr_pred
# 第二層
model = build_model_etr(Strak_X_train, y_train)

val_rf = model_rf.predict(x_val)
val_etr = model_etr.predict(x_val)

Strak_X_val = pd.DataFrame()
Strak_X_val['Method_2'] = val_rf
Strak_X_val['Method_4'] = val_etr



# test prediction for submission
y_test_pred = model.predict_proba(Strak_X_val)
dftest1 = dftest[['User_id', 'Coupon_id', 'Date_received']].copy()
dftest1['label'] = y_test_pred[:, 1]
dftest1.to_csv('submit.csv', index=False, header=False)
dftest1.head()
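
# ---- Motor_fault.model_utils ----
# The helper module imported at the top of Model 3 (build_model_rf, build_model_etr
# and friends) is listed below; it appears to be shared with a separate motor-fault
# project, which explains the unrelated paths and the extra model builders.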
import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, \
    AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, precision_recall_curve, auc, roc_curve, \
    f1_score, recall_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from utils.read_write import writeOneCsv

src = r'E:\專案檔案\電機故障診斷\data\\'


def build_model_dt(x_train, y_train):
    estimator = DecisionTreeClassifier(random_state=7)
    param_grid = {
        'max_depth': range(10, 25, 1),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('dt')
    print(model.best_params_)
    writeParams('dt', model.best_params_)
    return model


def build_model_rf(x_train, y_train):
    estimator = RandomForestClassifier()
    param_grid = {
        'max_depth': range(42, 43, 1),
        'n_estimators': range(79, 80, 1),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('rf')
    print(model.best_params_)
    writeParams('rf', model.best_params_)
    return model


def build_model_etr(x_train, y_train):
    # 極端隨機樹分類器   n_estimators 即 ExtraTreesClassifier 最大的決策樹個數
    estimator = ExtraTreesClassifier()
    param_grid = {
        'max_depth': range(33, 34, 1),
        'n_estimators': range(108, 109, 1),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('etr')
    print(model.best_params_)
    writeParams('etr', model.best_params_)
    return model


def build_model_xgb(x_train, y_train):
    estimator = XGBClassifier(gamma=0, colsample_bytree=0.9, subsample=0.91)
    param_grid = {
        'learning_rate': [ 0.27],
        'max_depth': range(12, 13, 1),
        'n_estimators': range(34, 35, 3),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('xgb')
    print(model.best_params_)
    writeParams('xgb', model.best_params_)
    return model


def build_model_lgb(x_train, y_train):
    estimator = LGBMClassifier()
    param_grid = {
        'learning_rate': [0.18],
        'n_estimators': range(100, 101, 1),
        'num_leaves': range(75, 80, 5)
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train.ravel())
    print('lgb')
    print(gbm.best_params_)
    writeParams('lgb', gbm.best_params_)
    return gbm


def build_model_mlpr(x_train, y_train):
    from sklearn.neural_network import MLPClassifier
    '''啟用函式用relu,梯度下降方法用lbfgs,效果是最好的'''
    mlp = MLPClassifier(activation='relu', solver='lbfgs')
    param_grid = {
        'alpha': [0.002, 0.001],
        'hidden_layer_sizes': [(38, 19)],
        'max_iter': range(75, 85, 1),
    }
    model = GridSearchCV(mlp, param_grid, cv=3)
    model.fit(x_train, y_train.ravel())
    print('mlpr')
    print(model.best_params_)
    writeParams('mlpr', model.best_params_)
    return model


def build_model_ada(x_train, y_train):
    estimator = AdaBoostClassifier()
    param_grid = {
        'learning_rate': [0.23],
        'n_estimators': range(13, 14, 1),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('ada')
    print(model.best_params_)
    writeParams('ada', model.best_params_)
    return model


def build_model_gbdt(x_train, y_train):
    estimator = GradientBoostingClassifier(min_samples_leaf=0.1, min_samples_split=10, subsample=0.998)
    param_grid = {
        'learning_rate': [0.75],
        'max_depth': range(25, 30, 1),
        'n_estimators': range(80, 85, 1)
    }
    gbdt = GridSearchCV(estimator, param_grid, cv=3)
    gbdt.fit(x_train, y_train.ravel())
    print('gbdt')
    print(gbdt.best_params_)
    writeParams('gbdt', gbdt.best_params_)
    return gbdt


def build_model_liner_svc(x_train, y_train):
    svm_reg = LinearSVC(max_iter=10000)
    param_grid = {
        'C': range(1, 2, 1),
    }
    model = GridSearchCV(svm_reg, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('LinearSVC')
    print(model.best_params_)
    return model


def train_logistic_classifier(x_train, y_train):
    model = LogisticRegression()
    param_grid = {
        'C': range(2, 3, 1),
        'penalty': ['l2'],
    }
    model = GridSearchCV(model, param_grid, cv=3)
    model.fit(x_train, y_train.ravel())
    print('LR')
    print(model.best_params_)
    return model


def build_model_svc(x_train, y_train):
    model = SVC(max_iter=-1)
    param_grid = {
        'C': range(1, 2, 2),
        'kernel': ['poly', 'rbf'],
        'cache_size': range(200, 210, 20),
    }
    model = GridSearchCV(model, param_grid, cv=3)
    model.fit(x_train, y_train.ravel())
    print('SVC')
    print(model.best_params_)
    return model


def score_model(test, predict, model, data_type):
    accuracy = round(accuracy_score(test, predict), 6)
    print(data_type + ',accuracy,', accuracy)
    writeOneCsv(['staking', data_type, 'accuracy', accuracy], src + '調參記錄.csv')
    pre_score = precision_score(test, predict, average="macro")
    print(data_type + ",precision,", round(pre_score, 6))
    writeOneCsv(['staking', data_type, 'precision', round(pre_score, 6)], src + '調參記錄.csv')
    roc_auc = round(roc_auc_score(test, predict), 6)
    print(data_type + ",roc_auc,", roc_auc)
    writeOneCsv(['staking', data_type, 'roc_auc', roc_auc], src + '調參記錄.csv')
    f1 = f1_score(predict, test)
    print(data_type + ",f1,", round(f1, 6))
    writeOneCsv(['staking', data_type, 'f1', round(f1, 6)], src + '調參記錄.csv')
    recall = recall_score(predict, test)
    print(data_type + ",recall,", round(recall, 6))
    writeOneCsv(['staking', data_type, 'recall', round(recall, 6)], src + '調參記錄.csv')
    cohen_kappa = cohen_kappa_score(predict, test)
    print(data_type + ",cohen_kappa,", round(cohen_kappa, 6))
    writeOneCsv(['staking', data_type, 'cohen_kappa', round(cohen_kappa, 6)], src + '調參記錄.csv')


def save_load(model, save_or_load):
    path = src + 'etr.pkl'
    # save model
    if save_or_load == 'save':
        joblib.dump(model, path)
    else:
        # load model
        model_etr = joblib.load(path)
        return model_etr


def fit_size(x, y):
    from sklearn import preprocessing
    x_min = preprocessing.MinMaxScaler()
    y_min = preprocessing.MinMaxScaler()
    y = np.array(y).reshape(len(y), 1)
    x = x_min.fit_transform(x)
    y = y_min.fit_transform(y)
    return x, y


def scatter_line(y_val, y_pre):
    xx = range(0, len(y_val))
    plt.scatter(xx, y_val, color="red", label="actual", linewidth=3)
    plt.plot(xx, y_pre, color="orange", label="predicted", linewidth=2)
    plt.legend()
    plt.show()


def draw_ROC_curve(y_test, y_predict):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_predict)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.legend()
    plt.show()
    plt.close(0)


def pr(y_val, predict_proba):
    precision, recall, thresholds = precision_recall_curve(y_val, predict_proba)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve')
    plt.legend()
    plt.show()


def writeParams(model, best):
    if model in ['gbdt', 'xgb']:
        writeOneCsv([model, best['max_depth'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
    elif model == 'mlpr':
        writeOneCsv([model, best['hidden_layer_sizes'], best['max_iter'], best['alpha']], src + '調參記錄.csv')
    elif model == 'ada':
        writeOneCsv([model, 0, best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
    elif model == 'lgb':
        writeOneCsv([model, best['num_leaves'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
    elif model == 'dt':
        writeOneCsv([model, best['max_depth'], 0, 0], src + '調參記錄.csv')
    else:
        writeOneCsv([model, best['max_depth'], best['n_estimators'], 0], src + '調參記錄.csv')


def write_mae(model, data_type, mae):
    writeOneCsv([model, data_type, 'mae', mae], src + '調參記錄.csv')
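
A minimal sketch of how these helpers could be wired back into the Model 3 stack above (hypothetical: the original snippet never builds true validation labels, so the scoring call is left commented out):

val_pred = model.predict(Strak_X_val)           # second-layer predictions on the test-set meta-features
# score_model(y_val, val_pred, model, 'val')    # would need a y_val label vector; logs metrics via writeOneCsv
save_load(model, 'save')                        # persist the ETR stacker to src + 'etr.pkl'
model_etr = save_load(None, 'load')             # reload it later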

If you found this useful, remember to give it a like, a favorite and a follow.
