Tianchi Newcomer Competition: O2O Coupon Usage Prediction - Rank 181
Data
This competition provides users' real online and offline consumption behaviour from 2016-01-01 to 2016-06-30, and asks you to predict whether coupons received by users in July 2016 will be used within 15 days of receipt.
For details, see the competition page: O2O Coupon Usage Prediction.
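The label used throughout the code below follows this 15-day rule: a received coupon counts as a positive example only when a consumption date exists and falls within 15 days of the receive date. Here is a minimal, hypothetical pandas sketch of that rule with toy data; the scripts below implement the same check, but with a sentinel date instead of NaT for missing dates:

import pandas as pd

# toy offline records: redeemed within 15 days, redeemed too late, never redeemed
df = pd.DataFrame({
    'Date_received': pd.to_datetime(['2016-05-01', '2016-05-01', '2016-05-01']),
    'Date': pd.to_datetime(['2016-05-10', '2016-05-30', pd.NaT]),
})
# positive label only when a consumption date exists and is within 15 days of receipt
df['label'] = (df.Date.notna() &
               ((df.Date - df.Date_received) <= pd.Timedelta(days=15))).astype(int)
print(df.label.tolist())  # [1, 0, 0]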
Approach:
Drop unneeded columns
Fill missing values
Compute statistical features (user, merchant, coupon, user-merchant)
Blend ['gbdt', 'xgb', 'rf_gini', 'et_gini', 'lgb', 'cat'] models for the final prediction (see the sketch after this list)
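Before the full scripts, here is a minimal sketch of the blending scheme, assuming placeholder data from make_classification and two stand-in base models; the real scripts below use the gbdt/xgb/rf/et/lgb/cat classifiers listed above, average each base model's predictions on the submission set across folds, and feed the out-of-fold predictions into a LogisticRegression meta-model:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=2000, random_state=0)  # placeholder data
base_models = [GradientBoostingClassifier(), RandomForestClassifier()]  # stand-ins for gbdt/xgb/lgb/cat/rf/et
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# out-of-fold predictions from each base model become the meta-model's features
blend_train = np.zeros((len(X), len(base_models)))
for j, clf in enumerate(base_models):
    for train_idx, valid_idx in skf.split(X, y):
        clf.fit(X[train_idx], y[train_idx])
        blend_train[valid_idx, j] = clf.predict_proba(X[valid_idx])[:, 1]

# second-level model learns how to weight the base models
meta = LogisticRegression().fit(blend_train, y)
print('meta-model coefficients:', meta.coef_)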
Model 1: score 0.8
# Work around an lgb (LightGBM) startup error by allowing a duplicate OpenMP runtime
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import datetime
from concurrent.futures import ProcessPoolExecutor
from math import ceil
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
# dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
# dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
# dfon = pd.read_csv('ccf_online_stage1_train.csv')
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
def drop_columns(X, predict=False):
columns = [
'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
# 'u33', 'u34'
]
if predict:
columns.append('Coupon_id')
else:
columns.append('Date')
X.drop(columns=columns, inplace=True)
def get_preprocess_data(predict=False):
if predict:
offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
else:
offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])
offline.Distance.fillna(11, inplace=True)
offline.Distance = offline.Distance.astype(int)
offline.Coupon_id.fillna(0, inplace=True)
offline.Coupon_id = offline.Coupon_id.astype(int)
offline.Date_received.fillna(date_null, inplace=True)
offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
'Discount_rate'].str.split(':', expand=True).astype(int)
offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)
if predict:
return offline
offline.Date.fillna(date_null, inplace=True)
# online
online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])
online.Coupon_id.fillna(0, inplace=True)
# online.Coupon_id = online.Coupon_id.astype(int)
online.Date_received.fillna(date_null, inplace=True)
online.Date.fillna(date_null, inplace=True)
return offline, online
def task(X_chunk, X, counter):
print(counter, end=',', flush=True)
X_chunk = X_chunk.copy()
X_chunk['o17'] = -1
X_chunk['o18'] = -1
for i, user in X_chunk.iterrows():
temp = X[X.User_id == user.User_id]
temp1 = temp[temp.Date_received < user.Date_received]
temp2 = temp[temp.Date_received > user.Date_received]
# 使用者此次之後/前領取的所有優惠券數目
X_chunk.loc[i, 'o3'] = len(temp1)
X_chunk.loc[i, 'o4'] = len(temp2)
# 使用者此次之後/前領取的特定優惠券數目
X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])
# 使用者上/下一次領取的時間間隔
temp1 = temp1.sort_values(by='Date_received', ascending=False)
if len(temp1):
X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days
temp2 = temp2.sort_values(by='Date_received')
if len(temp2):
X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days
return X_chunk
def get_offline_features(X, offline):
# X = X[:1000]
print(len(X), len(X.columns))
temp = offline[offline.Coupon_id != 0]
coupon_consume = temp[temp.Date != date_null]
coupon_no_consume = temp[temp.Date == date_null]
user_coupon_consume = coupon_consume.groupby('User_id')
X['weekday'] = X.Date_received.dt.weekday
X['day'] = X.Date_received.dt.day
# # 距離優惠券消費次數
# temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券不消費次數
# temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券領取次數
# X['distance_2'] = X.distance_0 + X.distance_1
#
# # 距離優惠券消費率
# X['distance_3'] = X.distance_0 / X.distance_2
# temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
# temp['d4'] = temp.Distance.sum() / len(temp)
# X = pd.merge(X, temp, how='left', on='Distance')
'''user features'''
# 優惠券消費次數
temp = user_coupon_consume.size().reset_index(name='u2')
X = pd.merge(X, temp, how='left', on='User_id')
# X.u2.fillna(0, inplace=True)
# X.u2 = X.u2.astype(int)
# 優惠券不消費次數
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用優惠券次數與沒使用優惠券次數比值
X['u19'] = X.u2 / X.u3
# 領取優惠券次數
X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
# 優惠券核銷率
X['u4'] = X.u2 / X.u1
# 普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp1 = temp.groupby('User_id').size().reset_index(name='u5')
X = pd.merge(X, temp1, how='left', on='User_id')
# 一共消費多少次
X['u25'] = X.u2 + X.u5
# 使用者使用優惠券消費佔比
X['u20'] = X.u2 / X.u25
# 正常消費平均間隔
temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
# 優惠券消費平均間隔
temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
# 15天內平均會普通消費幾次
X['u8'] = X.u6 / 15
# 15天內平均會優惠券消費幾次
X['u9'] = X.u7 / 15
# 領取優惠券到使用優惠券的平均間隔時間
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
X = pd.merge(X, temp, how='left', on='User_id')
# 在15天內使用掉優惠券的值大小
X['u11'] = X.u10 / 15
# 領取優惠券到使用優惠券間隔小於15天的次數
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = temp[temp.days <= 15]
temp = temp.groupby('User_id').size().reset_index(name='u21')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者15天使用掉優惠券的次數除以使用優惠券的次數
X['u22'] = X.u21 / X.u2
# 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
X['u23'] = X.u21 / X.u3
# 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
X['u24'] = X.u21 / X.u1
# 消費優惠券的平均折率
temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最低消費折率
temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最高消費折率
temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過的不同優惠券數量
temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='u32')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取所有不同優惠券數量
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
X['u33'] = X.u32 / X.u47
# 使用者平均每種優惠券核銷多少張
X['u34'] = X.u2 / X.u47
# 核銷優惠券使用者-商家平均距離
temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
temp = temp.groupby('User_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
temp['u35'] = temp.y / temp.x
temp = temp[['User_id', 'u35']]
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最小使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最大使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
X = pd.merge(X, temp, how='left', on='User_id')
# 優惠券型別
discount_types = [
'0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
'20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
'50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
'200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
'300:10', '200:5', '50:1', '100:1',
]
X['discount_type'] = -1
for k, v in enumerate(discount_types):
X.loc[X.Discount_rate == v, 'discount_type'] = k
# 不同優惠券領取次數
temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券使用次數
temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券不使用次數
temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同打折優惠券使用率
X['u44'] = X.u42 / X.u41
# 滿減型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(':') == True]
temp = temp.groupby('User_id').size().reset_index(name='u48')
X = pd.merge(X, temp, how='left', on='User_id')
# Number of percentage-discount-type coupons received by the user
temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
temp = temp.groupby('User_id').size().reset_index(name='u49')
X = pd.merge(X, temp, how='left', on='User_id')
'''offline merchant features'''
# 商戶消費次數
temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷次數
temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶正常消費筆數
X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
# 商家優惠券被領取次數
temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷率
X['m4'] = X.m1 / X.m3
# 商家優惠券被領取後不核銷次數
temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶當天優惠券領取次數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商戶當天優惠券領取人數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商家優惠券核銷的平均消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# Maximum discount rate among the merchant's redeemed coupons
temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# Minimum discount rate among the merchant's redeemed coupons
temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷不同的使用者數量
temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券領取不同的使用者數量
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
X['m13'] = X.m11 / X.m12
# 商家優惠券平均每個使用者核銷多少張
X['m14'] = X.m1 / X.m12
# 商家被核銷過的不同優惠券數量
temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家領取過的不同優惠券數量的比重
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
X['m19'] = X.m15 / X.m18
# 商家被核銷優惠券的平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('Merchant_id')
X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家平均距離
temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
temp['m21'] = temp.y / temp.x
temp = temp[['Merchant_id', 'm21']]
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最小距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最大距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
X = pd.merge(X, temp, how='left', on='Merchant_id')
"""offline coupon features"""
# 此優惠券一共發行多少張
temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 此優惠券一共被使用多少張
temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 優惠券使用率
X['c3'] = X.c2 / X.c1
# 沒有使用的數目
X['c4'] = X.c1 - X.c2
# 此優惠券在當天發行了多少張
temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
# 優惠券型別(直接優惠為0, 滿減為1)
X['c6'] = 0
X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
# 不同打折優惠券領取次數
temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用次數
temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券不使用次數
temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用率
X['c11'] = X.c9 / X.c8
# 優惠券核銷平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
temp = temp.drop_duplicates('Coupon_id')
X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')
'''user merchant feature'''
# 使用者領取商家的優惠券次數
temp = offline[offline.Coupon_id != 0]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後不核銷次數
temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷次數
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷率
X['um4'] = X.um3 / X.um1
# 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
X = pd.merge(X, temp, how='left', on='User_id')
X['um5'] = X.um2 / X.temp
X.drop(columns='temp', inplace=True)
# 使用者在商店總共消費過幾次
temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者在商店普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者當天在此商店領取的優惠券數目
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
# 使用者領取優惠券不同商家數量
temp = offline[offline.Coupon_id == offline.Coupon_id]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
temp = temp.groupby('User_id').size().reset_index(name='um9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券不同商家數量
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='um10')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
X['um11'] = X.um10 / X.um9
# 使用者平均核銷每個商家多少張優惠券
X['um12'] = X.u2 / X.um9
'''other feature'''
# 使用者領取的所有優惠券數目
temp = X.groupby('User_id').size().reset_index(name='o1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# parallel feature extraction with multiple processes
# split the data into chunks
stop = len(X)
step = int(ceil(stop / cpu_jobs))
X_chunks = [X[i:i + step] for i in range(0, stop, step)]
X_list = [X] * cpu_jobs
counters = [i for i in range(cpu_jobs)]
start = datetime.datetime.now()
with ProcessPoolExecutor() as e:
X = pd.concat(e.map(task, X_chunks, X_list, counters))
print('time:', str(datetime.datetime.now() - start).split('.')[0])
# end of multi-process section
# 使用者領取優惠券平均時間間隔
temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
# 使用者領取特定商家的優惠券數目
temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取的不同商家數目
temp = X.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者當天領取的優惠券數目
temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
# 使用者當天領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
# 使用者領取的所有優惠券種類數目
temp = X.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o12')
X = pd.merge(X, temp, how='left', on='User_id')
# 商家被領取的優惠券數目
temp = X.groupby('Merchant_id').size().reset_index(name='o13')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被領取的特定優惠券數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
# 商家被多少不同使用者領取的數目
temp = X.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家發行的所有優惠券種類數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
X = pd.merge(X, temp, how='left', on='Merchant_id')
print(len(X), len(X.columns))
return X
def get_online_features(online, X):
# temp = online[online.Coupon_id == online.Coupon_id]
# coupon_consume = temp[temp.Date == temp.Date]
# coupon_no_consume = temp[temp.Date != temp.Date]
# 使用者線上操作次數
temp = online.groupby('User_id').size().reset_index(name='on_u1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選次數
temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選率
X['on_u3'] = X.on_u2 / X.on_u1
# 使用者線上購買次數
temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上購買率
X['on_u5'] = X.on_u4 / X.on_u1
# 使用者線上領取次數
temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上領取率
X['on_u7'] = X.on_u6 / X.on_u1
# 使用者線上不消費次數
temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u8')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷次數
temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷率
X['on_u10'] = X.on_u9 / X.on_u6
# 使用者線下不消費次數佔線上線下總的不消費次數的比重
X['on_u11'] = X.u3 / (X.on_u8 + X.u3)
# 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
X['on_u12'] = X.u2 / (X.on_u9 + X.u2)
# 使用者線下領取的記錄數量佔總的記錄數量的比重
X['on_u13'] = X.u1 / (X.on_u6 + X.u1)
# # 消費優惠券的平均折率
# temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最低消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最高消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 不同打折優惠券領取次數
# temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用次數
# temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券不使用次數
# temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用率
# X['oc4'] = X.oc2 / X.oc1
print(len(X), len(X.columns))
print('----------')
return X
def get_train_data():
path = 'cache_%s_train.csv' % os.path.basename(__file__)
if os.path.exists(path):
data = pd.read_csv(path)
else:
offline, online = get_preprocess_data()
# date received 2016-01-01 - 2016-06-15
# date consumed 2016-01-01 - 2016-06-30
# train data 1
# 2016-04-16 ~ 2016-05-15
data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
data_1['label'] = 0
data_1.loc[
(data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 1
# 領券 2016-01-01 ~ 2016-03-31
end = '2016-03-31'
data_off_1 = offline[offline.Date_received <= end]
data_on_1 = online[online.Date_received <= end]
# 普通消費 2016-01-01 ~ 2016-04-15
end = '2016-04-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]
data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)
# train data 2
# 2016-05-16 ~ 2016-06-15
data_2 = offline['2016-05-16' <= offline.Date_received].copy()
data_2['label'] = 0
data_2.loc[
(data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 2
# 領券
start = '2016-02-01'
end = '2016-04-30'
data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]
# 普通消費
start = '2016-02-01'
end = '2016-05-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]
data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)
data = pd.concat([data_1, data_2])
# undersampling
# if undersampling:
# temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
# temp = X_1[X_1.User_id.isin(temp.User_id)]
# X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])
# data.drop_duplicates(inplace=True)
drop_columns(data)
data.fillna(0, inplace=True)
data.to_csv(path, index=False)
return data
def analysis():
offline, online = get_preprocess_data()
# t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Discount_rate')
# t['consume_rate'] = t.consume_count / t.receive_count
# t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Merchant_id')
# t['consume_rate'] = t.consume_count / t.receive_count
t = offline.groupby('Distance').size().reset_index(name='receive_count')
t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
t = pd.merge(t, t1, on='Distance')
t['consume_rate'] = t.consume_count / t.receive_count
t.to_csv('note.csv')
# plt.bar(temp.Discount_rate.values, temp.total.values)
# plt.bar(range(num), y1, bottom=y2, fc='r')
# plt.show()
exit()
def detect_duplicate_columns():
X = get_train_data()
X = X[:1000]
for index1 in range(len(X.columns) - 1):
for index2 in range(index1 + 1, len(X.columns)):
column1 = X.columns[index1]
column2 = X.columns[index2]
X[column1] = X[column1].astype(str)
X[column2] = X[column2].astype(str)
temp = len(X[X[column1] == X[column2]])
if temp == len(X):
print(column1, column2, temp)
exit()
def feature_importance_score():
clf = train_xgb()
fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
fscores.plot(kind='bar', title='Feature Importance')
plt.ylabel('Feature Importance Score')
plt.show()
exit()
def feature_selection():
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
X = train_data.copy().drop(columns='Coupon_id')
y = X.pop('label')
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# X = sel.fit_transform(X)
# print(X.shape)
# Create the RFE object and rank each pixel
def fit_eval_metric(estimator, X, y, name=None):
if name is None:
name = estimator.__class__.__name__
if name in ('XGBClassifier', 'LGBMClassifier'):
estimator.fit(X, y, eval_metric='auc')
else:
estimator.fit(X, y)
return estimator
def grid_search(estimator, param_grid):
start = datetime.datetime.now()
print('--------------------------------------------')
print(start.strftime('%Y-%m-%d %H:%M:%S'))
print(param_grid)
print()
data = get_train_data()
data, _ = train_test_split(data, train_size=100000, random_state=0)
X = data.copy().drop(columns='Coupon_id')
y = X.pop('label')
estimator_name = estimator.__class__.__name__
n_jobs = cpu_jobs
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
# cv=5
)
clf = fit_eval_metric(clf, X, y, estimator_name)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
print()
print('best params', clf.best_params_)
print('best score', clf.best_score_)
print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
print()
return clf.best_params_, clf.best_score_
def grid_search_auto(steps, params, estimator):
global log
old_params = params.copy()
while 1:
for name, step in steps.items():
score = 0
start = params[name] - step['step']
if start <= step['min']:
start = step['min']
stop = params[name] + step['step']
if step['max'] != 'inf' and stop >= step['max']:
stop = step['max']
while 1:
if str(step['step']).count('.') == 1:
stop += step['step'] / 10
else:
stop += step['step']
param_grid = {
name: np.arange(start, stop, step['step']),
}
best_params, best_score = grid_search(estimator.set_params(**params), param_grid)
if best_params[name] == params[name] or score > best_score:
print(estimator.__class__.__name__, params)
break
direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
start = stop = best_params[name] + step['step'] * direction
score = best_score
params[name] = best_params[name]
print(estimator.__class__.__name__, params)
if best_params[name] - step['step'] < step['min'] or (
step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
break
if old_params == params:
break
old_params = params
print('--------------------------------------------')
print('new grid search')
print('--------------------------------------------')
log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)
def grid_search_gbdt(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1900,
'max_depth': 9,
'min_samples_split': 200,
'min_samples_leaf': 50,
'subsample': .8,
# 'learning_rate': 1e-1,
# 'n_estimators': 200,
# 'max_depth': 8,
# 'min_samples_split': 200,
# 'min_samples_leaf': 50,
# 'subsample': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, GradientBoostingClassifier())
def grid_search_xgb(get_param=False):
params = {
'learning_rate': 1e-2,
'n_estimators': 1260,
'max_depth': 8,
'min_child_weight': 4,
'gamma': .2,
'subsample': .6,
'colsample_bytree': .8,
'scale_pos_weight': 1,
'reg_alpha': 0,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
'gamma': {'step': .1, 'min': 0, 'max': 1},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
}
grid_search_auto(steps, params, XGBClassifier())
def grid_search_lgb(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1200,
'num_leaves': 51,
'min_split_gain': 0,
'min_child_weight': 1e-3,
'min_child_samples': 22,
'subsample': .8,
'colsample_bytree': .8,
# 'learning_rate': .1,
# 'n_estimators': 90,
# 'num_leaves': 50,
# 'min_split_gain': 0,
# 'min_child_weight': 1e-3,
# 'min_child_samples': 21,
# 'subsample': .8,
# 'colsample_bytree': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
# 'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, LGBMClassifier())
def grid_search_cat(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 3600,
'max_depth': 8,
'max_bin': 127,
'reg_lambda': 2,
'subsample': .7,
# 'learning_rate': 1e-1,
# 'iterations': 460,
# 'depth': 8,
# 'l2_leaf_reg': 8,
# 'border_count': 37,
# 'ctr_border_count': 16,
'one_hot_max_size': 2,
'bootstrap_type': 'Bernoulli',
'leaf_estimation_method': 'Newton',
'verbose': False,
'eval_metric': 'AUC',
'thread_count': cpu_jobs
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'max_bin': {'step': 1, 'min': 1, 'max': 255},
'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
}
grid_search_auto(steps, params, CatBoostClassifier())
def grid_search_rf(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3090,
'max_depth': 15,
'min_samples_split': 2,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3110,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, RandomForestClassifier())
def grid_search_et(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3060,
'max_depth': 22,
'min_samples_split': 12,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, ExtraTreesClassifier())
def train_gbdt(model=False):
global log
params = grid_search_gbdt(True)
clf = GradientBoostingClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'gbdt'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', subsample: %.1f' % params['subsample']
log += '\n\n'
return train(clf)
def train_xgb(model=False):
global log
params = grid_search_xgb(True)
clf = XGBClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'xgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_child_weight: %d' % params['min_child_weight']
log += ', gamma: %.1f' % params['gamma']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_lgb(model=False):
global log
params = grid_search_lgb(True)
clf = LGBMClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'lgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', num_leaves: %d' % params['num_leaves']
log += ', min_split_gain: %.1f' % params['min_split_gain']
log += ', min_child_weight: %.4f' % params['min_child_weight']
log += ', min_child_samples: %d' % params['min_child_samples']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_cat(model=False):
global log
params = grid_search_cat(True)
clf = CatBoostClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'cat'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', iterations: %d' % params['iterations']
log += ', depth: %d' % params['depth']
log += ', l2_leaf_reg: %d' % params['l2_leaf_reg']
log += ', border_count: %d' % params['border_count']
log += ', subsample: %d' % params['subsample']
log += ', one_hot_max_size: %d' % params['one_hot_max_size']
log += '\n\n'
return train(clf)
def train_rf(clf):
global log
params = clf.get_params()
log += 'rf'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_rf_gini(model=False):
clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
if model:
return clf
return train_rf(clf)
def train_rf_entropy():
clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))
return train_rf(clf)
def train_et(clf):
global log
params = clf.get_params()
log += 'et'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_et_gini(model=False):
clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
if model:
return clf
return train_et(clf)
def train_et_entropy():
clf = ExtraTreesClassifier().set_params(**{
'n_estimators': 310,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
})
return train_et(clf)
def train(clf):
global log
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
_, test_data = train_test_split(data, random_state=0)
X_train = train_data.copy().drop(columns='Coupon_id')
y_train = X_train.pop('label')
clf = fit_eval_metric(clf, X_train, y_train)
X_test = test_data.copy().drop(columns='Coupon_id')
y_test = X_test.pop('label')
y_true, y_pred = y_test, clf.predict(X_test)
# log += '%s\n' % classification_report(y_test, y_pred)
log += ' accuracy: %f\n' % accuracy_score(y_true, y_pred)
y_score = clf.predict_proba(X_test)[:, 1]
log += ' auc: %f\n' % roc_auc_score(y_true, y_score)
# coupon average auc
coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
aucs = []
for _, coupon in coupons.iterrows():
if coupon.total > 1:
X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
X_test.drop(columns='Coupon_id', inplace=True)
if len(X_test.label.unique()) != 2:
continue
y_true = X_test.pop('label')
y_score = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_true, y_score))
log += 'coupon auc: %f\n\n' % np.mean(aucs)
return clf
def predict(model):
path = 'cache_%s_predict.csv' % os.path.basename(__file__)
if os.path.exists(path):
X = pd.read_csv(path, parse_dates=['Date_received'])
else:
offline, online = get_preprocess_data()
# 2016-03-16 ~ 2016-06-30
start = '2016-03-16'
offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]
X = get_preprocess_data(True)
X = get_offline_features(X, offline)
X = get_online_features(online, X)
X.drop_duplicates(inplace=True)
X.fillna(0, inplace=True)
X.to_csv(path, index=False)
sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
drop_columns(X, True)
if model == 'blending':
predict = blending(X)
else:
clf = eval('train_%s' % model)()
predict = clf.predict_proba(X)[:, 1]
sample_submission['Probability'] = predict
sample_submission.to_csv('submission_%s.csv' % model,
# float_format='%.5f',
index=False, header=False)
def blending(predict_X=None):
global log
log += '\n'
X = get_train_data().drop(columns='Coupon_id')
y = X.pop('label')
X = np.asarray(X)
y = np.asarray(y)
_, X_submission, _, y_test_blend = train_test_split(X, y,
random_state=0
)
if predict_X is not None:
X_submission = np.asarray(predict_X)
X, _, y, _ = train_test_split(X, y,
train_size=100000,
random_state=0
)
# np.random.seed(0)
# idx = np.random.permutation(y.size)
# X = X[idx]
# y = y[idx]
skf = StratifiedKFold()
# clfs = ['gbdt', 'xgb', 'lgb', 'cat',
# # 'rf_gini', 'et_gini'
# ]
clfs = ['gbdt', 'cat', 'lgb']
blend_X_train = np.zeros((X.shape[0], len(clfs)))
blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))
for j, v in enumerate(clfs):
clf = eval('train_%s' % v)(True)
aucs = []
dataset_blend_test_j = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf = fit_eval_metric(clf, X_train, y_train)
y_submission = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_test, y_submission))
blend_X_train[test_index, j] = y_submission
dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])
log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)
print('blending')
clf = LogisticRegression()
# clf = GradientBoostingClassifier()
clf.fit(blend_X_train, y)
y_submission = clf.predict_proba(blend_X_test)[:, 1]
# Linear stretch of predictions to [0,1]
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
if predict_X is not None:
return y_submission
log += '\n blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)
if __name__ == '__main__':
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
predict('blending')
log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
log += '----------------------------------------------------\n'
open('%s.log' % os.path.basename(__file__), 'a').write(log)
print(log)
Model 2: score 0.79
# Work around an lgb (LightGBM) startup error by allowing a duplicate OpenMP runtime
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import datetime
from concurrent.futures import ProcessPoolExecutor
from math import ceil
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
def drop_columns(X, predict=False):
columns = [
'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
# 'u33', 'u34'
]
if predict:
columns.append('Coupon_id')
else:
columns.append('Date')
X.drop(columns=columns, inplace=True)
def get_preprocess_data(predict=False):
if predict:
offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
else:
offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])
offline.Distance.fillna(11, inplace=True)
offline.Distance = offline.Distance.astype(int)
offline.Coupon_id.fillna(0, inplace=True)
offline.Coupon_id = offline.Coupon_id.astype(int)
offline.Date_received.fillna(date_null, inplace=True)
offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
'Discount_rate'].str.split(':', expand=True).astype(int)
offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)
if predict:
return offline
offline.Date.fillna(date_null, inplace=True)
# online
online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])
online.Coupon_id.fillna(0, inplace=True)
# online.Coupon_id = online.Coupon_id.astype(int)
online.Date_received.fillna(date_null, inplace=True)
online.Date.fillna(date_null, inplace=True)
return offline, online
def task(X_chunk, X, counter):
print(counter, end=',', flush=True)
X_chunk = X_chunk.copy()
X_chunk['o17'] = -1
X_chunk['o18'] = -1
for i, user in X_chunk.iterrows():
temp = X[X.User_id == user.User_id]
temp1 = temp[temp.Date_received < user.Date_received]
temp2 = temp[temp.Date_received > user.Date_received]
# 使用者此次之後/前領取的所有優惠券數目
X_chunk.loc[i, 'o3'] = len(temp1)
X_chunk.loc[i, 'o4'] = len(temp2)
# 使用者此次之後/前領取的特定優惠券數目
X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])
# 使用者上/下一次領取的時間間隔
temp1 = temp1.sort_values(by='Date_received', ascending=False)
if len(temp1):
X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days
temp2 = temp2.sort_values(by='Date_received')
if len(temp2):
X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days
return X_chunk
def get_offline_features(X, offline):
# X = X[:1000]
print(len(X), len(X.columns))
temp = offline[offline.Coupon_id != 0]
coupon_consume = temp[temp.Date != date_null]
coupon_no_consume = temp[temp.Date == date_null]
user_coupon_consume = coupon_consume.groupby('User_id')
X['weekday'] = X.Date_received.dt.weekday
X['day'] = X.Date_received.dt.day
# # 距離優惠券消費次數
# temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券不消費次數
# temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券領取次數
# X['distance_2'] = X.distance_0 + X.distance_1
#
# # 距離優惠券消費率
# X['distance_3'] = X.distance_0 / X.distance_2
# temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
# temp['d4'] = temp.Distance.sum() / len(temp)
# X = pd.merge(X, temp, how='left', on='Distance')
'''user features'''
# 優惠券消費次數
temp = user_coupon_consume.size().reset_index(name='u2')
X = pd.merge(X, temp, how='left', on='User_id')
# X.u2.fillna(0, inplace=True)
# X.u2 = X.u2.astype(int)
# 優惠券不消費次數
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用優惠券次數與沒使用優惠券次數比值
X['u19'] = X.u2 / X.u3
# 領取優惠券次數
X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
# 優惠券核銷率
X['u4'] = X.u2 / X.u1
# 普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp1 = temp.groupby('User_id').size().reset_index(name='u5')
X = pd.merge(X, temp1, how='left', on='User_id')
# 一共消費多少次
X['u25'] = X.u2 + X.u5
# 使用者使用優惠券消費佔比
X['u20'] = X.u2 / X.u25
# 正常消費平均間隔
temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
# 優惠券消費平均間隔
temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
# 15天內平均會普通消費幾次
X['u8'] = X.u6 / 15
# 15天內平均會優惠券消費幾次
X['u9'] = X.u7 / 15
# 領取優惠券到使用優惠券的平均間隔時間
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
X = pd.merge(X, temp, how='left', on='User_id')
# 在15天內使用掉優惠券的值大小
X['u11'] = X.u10 / 15
# 領取優惠券到使用優惠券間隔小於15天的次數
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = temp[temp.days <= 15]
temp = temp.groupby('User_id').size().reset_index(name='u21')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者15天使用掉優惠券的次數除以使用優惠券的次數
X['u22'] = X.u21 / X.u2
# 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
X['u23'] = X.u21 / X.u3
# 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
X['u24'] = X.u21 / X.u1
# 消費優惠券的平均折率
temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最低消費折率
temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最高消費折率
temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過的不同優惠券數量
temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='u32')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取所有不同優惠券數量
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
X['u33'] = X.u32 / X.u47
# 使用者平均每種優惠券核銷多少張
X['u34'] = X.u2 / X.u47
# 核銷優惠券使用者-商家平均距離
temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
temp = temp.groupby('User_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
temp['u35'] = temp.y / temp.x
temp = temp[['User_id', 'u35']]
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最小使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最大使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
X = pd.merge(X, temp, how='left', on='User_id')
# 優惠券型別
discount_types = [
'0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
'20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
'50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
'200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
'300:10', '200:5', '50:1', '100:1',
]
X['discount_type'] = -1
for k, v in enumerate(discount_types):
X.loc[X.Discount_rate == v, 'discount_type'] = k
# 不同優惠券領取次數
temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券使用次數
temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券不使用次數
temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同打折優惠券使用率
X['u44'] = X.u42 / X.u41
# 滿減型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(':') == True]
temp = temp.groupby('User_id').size().reset_index(name='u48')
X = pd.merge(X, temp, how='left', on='User_id')
# Number of percentage-discount-type coupons received by the user
temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
temp = temp.groupby('User_id').size().reset_index(name='u49')
X = pd.merge(X, temp, how='left', on='User_id')
'''offline merchant features'''
# 商戶消費次數
temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷次數
temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶正常消費筆數
X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
# 商家優惠券被領取次數
temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷率
X['m4'] = X.m1 / X.m3
# 商家優惠券被領取後不核銷次數
temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶當天優惠券領取次數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商戶當天優惠券領取人數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商家優惠券核銷的平均消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# Maximum discount rate among the merchant's redeemed coupons
temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# Minimum discount rate among the merchant's redeemed coupons
temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷不同的使用者數量
temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券領取不同的使用者數量
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
X['m13'] = X.m11 / X.m12
# 商家優惠券平均每個使用者核銷多少張
X['m14'] = X.m1 / X.m12
# 商家被核銷過的不同優惠券數量
temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家領取過的不同優惠券數量的比重
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
X['m19'] = X.m15 / X.m18
# 商家被核銷優惠券的平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('Merchant_id')
X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家平均距離
temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
temp['m21'] = temp.y / temp.x
temp = temp[['Merchant_id', 'm21']]
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最小距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最大距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
X = pd.merge(X, temp, how='left', on='Merchant_id')
"""offline coupon features"""
# 此優惠券一共發行多少張
temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 此優惠券一共被使用多少張
temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 優惠券使用率
X['c3'] = X.c2 / X.c1
# 沒有使用的數目
X['c4'] = X.c1 - X.c2
# 此優惠券在當天發行了多少張
temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
# 優惠券型別(直接優惠為0, 滿減為1)
X['c6'] = 0
X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
# 不同打折優惠券領取次數
temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用次數
temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券不使用次數
temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用率
X['c11'] = X.c9 / X.c8
# 優惠券核銷平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
temp = temp.drop_duplicates('Coupon_id')
X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')
'''user merchant feature'''
# 使用者領取商家的優惠券次數
temp = offline[offline.Coupon_id != 0]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後不核銷次數
temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷次數
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷率
X['um4'] = X.um3 / X.um1
# 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
X = pd.merge(X, temp, how='left', on='User_id')
X['um5'] = X.um2 / X.temp
X.drop(columns='temp', inplace=True)
# 使用者在商店總共消費過幾次
temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者在商店普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者當天在此商店領取的優惠券數目
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
# 使用者領取優惠券不同商家數量
temp = offline[offline.Coupon_id != 0]  # coupon-received records; Coupon_id was filled with 0, so the old NaN check (Coupon_id == Coupon_id) filtered nothing
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
temp = temp.groupby('User_id').size().reset_index(name='um9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券不同商家數量
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='um10')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
X['um11'] = X.um10 / X.um9
# 使用者平均核銷每個商家多少張優惠券
X['um12'] = X.u2 / X.um9
'''other feature'''
# 使用者領取的所有優惠券數目
temp = X.groupby('User_id').size().reset_index(name='o1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# multiple threads
# data split
stop = len(X)
step = int(ceil(stop / cpu_jobs))
X_chunks = [X[i:i + step] for i in range(0, stop, step)]
X_list = [X] * cpu_jobs
counters = [i for i in range(cpu_jobs)]
start = datetime.datetime.now()
with ProcessPoolExecutor() as e:
X = pd.concat(e.map(task, X_chunks, X_list, counters))
print('time:', str(datetime.datetime.now() - start).split('.')[0])
# multiple threads
# 使用者領取優惠券平均時間間隔
temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
# 使用者領取特定商家的優惠券數目
temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取的不同商家數目
temp = X.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者當天領取的優惠券數目
temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
# 使用者當天領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
# 使用者領取的所有優惠券種類數目
temp = X.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o12')
X = pd.merge(X, temp, how='left', on='User_id')
# 商家被領取的優惠券數目
temp = X.groupby('Merchant_id').size().reset_index(name='o13')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被領取的特定優惠券數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
# 商家被多少不同使用者領取的數目
temp = X.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家發行的所有優惠券種類數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
X = pd.merge(X, temp, how='left', on='Merchant_id')
print(len(X), len(X.columns))
return X
def get_online_features(online, X):
# temp = online[online.Coupon_id == online.Coupon_id]
# coupon_consume = temp[temp.Date == temp.Date]
# coupon_no_consume = temp[temp.Date != temp.Date]
# 使用者線上操作次數
temp = online.groupby('User_id').size().reset_index(name='on_u1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選次數
temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選率
X['on_u3'] = X.on_u2 / X.on_u1
# 使用者線上購買次數
temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上購買率
X['on_u5'] = X.on_u4 / X.on_u1
# 使用者線上領取次數
temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上領取率
X['on_u7'] = X.on_u6 / X.on_u1
# 使用者線上不消費次數
temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u8')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷次數
temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷率
X['on_u10'] = X.on_u9 / X.on_u6
# 使用者線下不消費次數佔線上線下總的不消費次數的比重
X['on_u11'] = X.u3 / (X.on_u8 + X.u3)
# 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
X['on_u12'] = X.u2 / (X.on_u9 + X.u2)
# 使用者線下領取的記錄數量佔總的記錄數量的比重
X['on_u13'] = X.u1 / (X.on_u6 + X.u1)
# # 消費優惠券的平均折率
# temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最低消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最高消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 不同打折優惠券領取次數
# temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用次數
# temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券不使用次數
# temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用率
# X['oc4'] = X.oc2 / X.oc1
print(len(X), len(X.columns))
print('----------')
return X
def get_train_data():
path = 'cache_%s_train.csv' % os.path.basename(__file__)
if os.path.exists(path):
data = pd.read_csv(path)
else:
offline, online = get_preprocess_data()
# date received 2016-01-01 - 2016-06-15
# date consumed 2016-01-01 - 2016-06-30
# train data 1
# 2016-04-16 ~ 2016-05-15
data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
data_1['label'] = 0
data_1.loc[
(data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 1
# 領券 2016-01-01 ~ 2016-03-31
end = '2016-03-31'
data_off_1 = offline[offline.Date_received <= end]
data_on_1 = online[online.Date_received <= end]
# 普通消費 2016-01-01 ~ 2016-04-15
end = '2016-04-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]
data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)
# train data 2
# 2016-05-16 ~ 2016-06-15
data_2 = offline[('2016-05-16' <= offline.Date_received) & (offline.Date_received <= '2016-06-15')].copy()
data_2['label'] = 0
data_2.loc[
(data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 2
# 領券
start = '2016-02-01'
end = '2016-04-30'
data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]
# 普通消費
start = '2016-02-01'
end = '2016-05-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]
data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)
data = pd.concat([data_1, data_2])
# undersampling
# if undersampling:
# temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
# temp = X_1[X_1.User_id.isin(temp.User_id)]
# X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])
# data.drop_duplicates(inplace=True)
drop_columns(data)
data.fillna(0, inplace=True)
data.to_csv(path, index=False)
return data
def analysis():
offline, online = get_preprocess_data()
# t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Discount_rate')
# t['consume_rate'] = t.consume_count / t.receive_count
# t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Merchant_id')
# t['consume_rate'] = t.consume_count / t.receive_count
t = offline.groupby('Distance').size().reset_index(name='receive_count')
t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
t = pd.merge(t, t1, on='Distance')
t['consume_rate'] = t.consume_count / t.receive_count
t.to_csv('note.csv')
# plt.bar(temp.Discount_rate.values, temp.total.values)
# plt.bar(range(num), y1, bottom=y2, fc='r')
# plt.show()
exit()
def detect_duplicate_columns():
X = get_train_data()
X = X[:1000]
for index1 in range(len(X.columns) - 1):
for index2 in range(index1 + 1, len(X.columns)):
column1 = X.columns[index1]
column2 = X.columns[index2]
X[column1] = X[column1].astype(str)
X[column2] = X[column2].astype(str)
temp = len(X[X[column1] == X[column2]])
if temp == len(X):
print(column1, column2, temp)
exit()
def feature_importance_score():
clf = train_xgb()
fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
fscores.plot(kind='bar', title='Feature Importance')
plt.ylabel('Feature Importance Score')
plt.show()
exit()
def feature_selection():
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
X = train_data.copy().drop(columns='Coupon_id')
y = X.pop('label')
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# X = sel.fit_transform(X)
# print(X.shape)
# Create the RFE object and rank each pixel
def fit_eval_metric(estimator, X, y, name=None):
if name is None:
name = estimator.__class__.__name__
if name in ('XGBClassifier', 'LGBMClassifier'):  # compare strings with equality/membership, not 'is'
estimator.fit(X, y, eval_metric='auc')
else:
estimator.fit(X, y)
return estimator
def grid_search(estimator, param_grid):
start = datetime.datetime.now()
print('--------------------------------------------')
print(start.strftime('%Y-%m-%d %H:%M:%S'))
print(param_grid)
print()
data = get_train_data()
data, _ = train_test_split(data, train_size=100000, random_state=0)
X = data.copy().drop(columns='Coupon_id')
y = X.pop('label')
estimator_name = estimator.__class__.__name__
n_jobs = cpu_jobs
if estimator_name in ('XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'):
n_jobs = 1
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
# cv=5
)
clf = fit_eval_metric(clf, X, y, estimator_name)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
print()
print('best params', clf.best_params_)
print('best score', clf.best_score_)
print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
print()
return clf.best_params_, clf.best_score_
def grid_search_auto(steps, params, estimator):
global log
old_params = params.copy()
while 1:
for name, step in steps.items():
score = 0
start = params[name] - step['step']
if start <= step['min']:
start = step['min']
stop = params[name] + step['step']
if step['max'] != 'inf' and stop >= step['max']:
stop = step['max']
while 1:
if str(step['step']).count('.') == 1:
stop += step['step'] / 10
else:
stop += step['step']
param_grid = {
name: np.arange(start, stop, step['step']),
}
best_params, best_score = grid_search(estimator.set_params(**params), param_grid)
if best_params[name] == params[name] or score > best_score:
print(estimator.__class__.__name__, params)
break
direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
start = stop = best_params[name] + step['step'] * direction
score = best_score
params[name] = best_params[name]
print(estimator.__class__.__name__, params)
if best_params[name] - step['step'] < step['min'] or (
step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
break
if old_params == params:
break
old_params = params
print('--------------------------------------------')
print('new grid search')
print('--------------------------------------------')
log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)
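# Note: grid_search_auto implements a coordinate-descent style search. For each
# parameter it probes a small grid around the current value (one step either side),
# keeps moving in the direction that improves the cross-validated AUC, and stops
# once a full pass over all parameters leaves them unchanged.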
def grid_search_gbdt(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1900,
'max_depth': 9,
'min_samples_split': 200,
'min_samples_leaf': 50,
'subsample': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, GradientBoostingClassifier())
def grid_search_xgb(get_param=False):
params = {
# 8名的引數
'booster': 'gbtree',
'objective': 'rank:pairwise',
'min_child_weight': 1.1,
'colsample_bylevel': .7,
'reg_lambda': 1,
'learning_rate': 1e-2,
'n_estimators': 3500,
'max_depth': 5,
'gamma': .1,
'subsample': .7,
'colsample_bytree': .7,
'scale_pos_weight': 1,
'reg_alpha': 0,
'nthread': 12,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
'gamma': {'step': .1, 'min': 0, 'max': 1},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
}
grid_search_auto(steps, params, XGBClassifier())
def grid_search_lgb(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1200,
'num_leaves': 51,
'min_split_gain': 0,
'min_child_weight': 1e-3,
'min_child_samples': 22,
'subsample': .8,
'colsample_bytree': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
# 'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, LGBMClassifier())
def grid_search_cat(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 3600,
'max_depth': 8,
'max_bin': 127,
'reg_lambda': 2,
'subsample': .7,
'one_hot_max_size': 2,
'bootstrap_type': 'Bernoulli',
'leaf_estimation_method': 'Newton',
'verbose': False,
'eval_metric': 'AUC',
'thread_count': cpu_jobs
}
if get_param:
return params
steps = {
'n_estimators': {'step': 150, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'max_bin': {'step': 1, 'min': 1, 'max': 255},
'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
}
grid_search_auto(steps, params, CatBoostClassifier())
def grid_search_rf(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3090,
'max_depth': 15,
'min_samples_split': 2,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3110,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, RandomForestClassifier())
def grid_search_et(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3060,
'max_depth': 22,
'min_samples_split': 12,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, ExtraTreesClassifier())
def train_gbdt(model=False):
global log
params = grid_search_gbdt(True)
clf = GradientBoostingClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'gbdt'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', subsample: %.1f' % params['subsample']
log += '\n\n'
return train(clf)
def train_xgb(model=False):
global log
params = grid_search_xgb(True)
clf = XGBClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'xgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_child_weight: %d' % params['min_child_weight']
log += ', gamma: %.1f' % params['gamma']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_lgb(model=False):
global log
params = grid_search_lgb(True)
clf = LGBMClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'lgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', num_leaves: %d' % params['num_leaves']
log += ', min_split_gain: %.1f' % params['min_split_gain']
log += ', min_child_weight: %.4f' % params['min_child_weight']
log += ', min_child_samples: %d' % params['min_child_samples']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_cat(model=False):
global log
params = grid_search_cat(True)
clf = CatBoostClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'cat'
log += ', learning_rate: %.3f' % params['learning_rate']
# log the keys that were actually set in grid_search_cat (avoids relying on CatBoost alias lookup)
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', reg_lambda: %d' % params['reg_lambda']
log += ', max_bin: %d' % params['max_bin']
log += ', subsample: %.1f' % params['subsample']
log += ', one_hot_max_size: %d' % params['one_hot_max_size']
log += '\n\n'
return train(clf)
def train_rf(clf):
global log
params = clf.get_params()
log += 'rf'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_rf_gini(model=False):
clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
if model:
return clf
return train_rf(clf)
def train_rf_entropy():
clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))
return train_rf(clf)
def train_et(clf):
global log
params = clf.get_params()
log += 'et'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_et_gini(model=False):
clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
if model:
return clf
return train_et(clf)
def train_et_entropy():
clf = ExtraTreesClassifier().set_params(**{
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
'random_state': 0
})
return train_et(clf)
def train(clf):
global log
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
_, test_data = train_test_split(data, random_state=0)
X_train = train_data.copy().drop(columns='Coupon_id')
y_train = X_train.pop('label')
clf = fit_eval_metric(clf, X_train, y_train)
X_test = test_data.copy().drop(columns='Coupon_id')
y_test = X_test.pop('label')
y_true, y_pred = y_test, clf.predict(X_test)
# log += '%s\n' % classification_report(y_test, y_pred)
log += ' accuracy: %f\n' % accuracy_score(y_true, y_pred)
y_score = clf.predict_proba(X_test)[:, 1]
log += ' auc: %f\n' % roc_auc_score(y_true, y_score)
# coupon average auc
coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
aucs = []
for _, coupon in coupons.iterrows():
if coupon.total > 1:
X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
X_test.drop(columns='Coupon_id', inplace=True)
if len(X_test.label.unique()) != 2:
continue
y_true = X_test.pop('label')
y_score = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_true, y_score))
log += 'coupon auc: %f\n\n' % np.mean(aucs)
return clf
def predict(model):
path = 'cache_%s_predict.csv' % os.path.basename(__file__)
if os.path.exists(path):
X = pd.read_csv(path, parse_dates=['Date_received'])
else:
offline, online = get_preprocess_data()
# 2016-03-16 ~ 2016-06-30
start = '2016-03-16'
offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]
X = get_preprocess_data(True)
X = get_offline_features(X, offline)
X = get_online_features(online, X)
X.drop_duplicates(inplace=True)
X.fillna(0, inplace=True)
X.to_csv(path, index=False)
sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
drop_columns(X, True)
if model == 'blending':
predict = blending(X)
else:
clf = eval('train_%s' % model)()
predict = clf.predict_proba(X)[:, 1]
sample_submission['Probability'] = predict
sample_submission.to_csv('submission_%s.csv' % model,
# float_format='%.5f',
index=False, header=False)
def blending(predict_X=None):
global log
log += '\n'
X = get_train_data().drop(columns='Coupon_id')
y = X.pop('label')
X = np.asarray(X)
y = np.asarray(y)
_, X_submission, _, y_test_blend = train_test_split(X, y,
random_state=0
)
if predict_X is not None:
X_submission = np.asarray(predict_X)
X, _, y, _ = train_test_split(X, y,
train_size=100000,
random_state=0
)
# np.random.seed(0)
# idx = np.random.permutation(y.size)
# X = X[idx]
# y = y[idx]
skf = StratifiedKFold()
clfs = ['gbdt', 'xgb',
'rf_gini', 'et_gini', 'lgb', 'cat'
]
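# Blending scheme below: each base model is trained with StratifiedKFold (5 folds
# by default); its out-of-fold probabilities fill one column of blend_X_train, its
# per-fold predictions on X_submission are averaged into blend_X_test, and a
# LogisticRegression is then fit on these meta-features as the second layer.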
blend_X_train = np.zeros((X.shape[0], len(clfs)))
blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))
for j, v in enumerate(clfs):
clf = eval('train_%s' % v)(True)
aucs = []
dataset_blend_test_j = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf = fit_eval_metric(clf, X_train, y_train)
y_submission = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_test, y_submission))
blend_X_train[test_index, j] = y_submission
dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])
log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)
print('blending')
clf = LogisticRegression()
# clf = GradientBoostingClassifier()
clf.fit(blend_X_train, y)
y_submission = clf.predict_proba(blend_X_test)[:, 1]
# Linear stretch of predictions to [0,1]
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
if predict_X is not None:
return y_submission
log += '\n blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)
print(log)
if __name__ == '__main__':
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
blending()
predict('blending')
log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
log += '----------------------------------------------------\n'
open('%s.log' % os.path.basename(__file__), 'a').write(log)
print(log)
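For reference, a single tuned model can also be trained and submitted on its own, since predict() looks up the matching train_* function by name. A minimal sketch (it would replace the blending() / predict('blending') calls in the __main__ block above; nothing else is assumed):
# predict('xgb')   # trains the tuned XGBClassifier via train_xgb() and writes submission_xgb.csv
# predict('lgb')   # likewise for the LGBMClassifier -> submission_lgb.csv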
模型3:分數0.5
import os
import pickle
from datetime import date
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from Motor_fault.model_utils import build_model_etr, build_model_rf
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('ccf_online_stage1_train.csv')
# 1. 將滿xx減yy型別(`xx:yy`)的券變成折扣率 : `1 - yy/xx`,同時建立折扣券相關的特徵 `discount_rate, discount_man, discount_jian, discount_type`
# 2. 將距離 `str` 轉為 `int`
# convert Discount_rate and Distance
def getDiscountType(row):
if pd.isnull(row):
return np.nan
elif ':' in row:
return 1
else:
return 0
def convertRate(row):
"""Convert discount to rate"""
if pd.isnull(row):
return 1.0
elif ':' in str(row):
rows = row.split(':')
return 1.0 - float(rows[1]) / float(rows[0])
else:
return float(row)
def getDiscountMan(row):
if ':' in str(row):
rows = row.split(':')
return int(rows[0])
else:
return 0
def getDiscountJian(row):
if ':' in str(row):
rows = row.split(':')
return int(rows[1])
else:
return 0
print("tool is ok.")
def processData(df):
# convert discunt_rate
df['discount_rate'] = df['Discount_rate'].apply(convertRate)
df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
print(df['discount_rate'].unique())
# convert distance
df['distance'] = df['Distance'].fillna(-1).astype(int)
return df
dfoff = processData(dfoff)
dftest = processData(dftest)
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[pd.notnull(date_buy)])
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'],
as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(
['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']
print("end")
def getWeekday(row):
if row == 'nan':
return np.nan
else:
return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type : 週六和週日為1,其他為0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
# change weekday to one-hot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
def label(row):
if pd.isnull(row['Date_received']):
return -1
if pd.notnull(row['Date']):
td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
if td <= pd.Timedelta(15, 'D'):
return 1
return 0
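# label(): 1 if the coupon is redeemed within 15 days of being received,
# 0 if it was received but not redeemed in time, and -1 if no coupon was
# received at all (the -1 rows are filtered out before training below).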
dfoff['label'] = dfoff.apply(label, axis=1)
print("end")
# data split
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")
# feature
original_feature = ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday',
'weekday_type'] + weekdaycols
print("----train-----")
x_train, y_train = train[original_feature], train['label']
x_val = dftest[original_feature]  # note: despite the name, this is the submission test set, not the `valid` split
model_rf = build_model_rf(x_train, y_train)
model_etr = build_model_etr(x_train, y_train)
train_etr_pred = model_etr.predict(x_train)
train_rf_pred = model_rf.predict(x_train)
Strak_X_train = pd.DataFrame()
Strak_X_train['Method_1'] = train_rf_pred   # first-layer rf predictions
Strak_X_train['Method_2'] = train_etr_pred  # first-layer etr predictions
# second layer: fit another extra-trees model on the first-layer predictions
model = build_model_etr(Strak_X_train, y_train)
val_rf = model_rf.predict(x_val)
val_etr = model_etr.predict(x_val)
Strak_X_val = pd.DataFrame()
Strak_X_val['Method_1'] = val_rf    # column names/order must match Strak_X_train
Strak_X_val['Method_2'] = val_etr
# test prediction for submission
y_test_pred = model.predict_proba(Strak_X_val)
dftest1 = dftest[['User_id', 'Coupon_id', 'Date_received']].copy()
dftest1['label'] = y_test_pred[:, 1]
dftest1.to_csv('submit.csv', index=False, header=False)
dftest1.head()
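# (assumption) The listing below appears to be the helper module imported at the top
# of model 3 (Motor_fault/model_utils.py, reused from another project); it defines
# build_model_rf / build_model_etr and the remaining tuning, scoring and plotting helpers.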
import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, \
AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, precision_recall_curve, auc, roc_curve, \
f1_score, recall_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from utils.read_write import writeOneCsv
src = r'E:\專案檔案\電機故障診斷\data\\'
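# utils.read_write is not included in this post. If it is unavailable, a minimal
# stand-in with the assumed behaviour (append one row to a CSV file) could look
# like the commented sketch below; the utf-8 encoding is also an assumption.
#
# import csv
#
# def writeOneCsv(row, path):
#     """Hypothetical stand-in: append a single row (list of values) to `path`."""
#     with open(path, 'a', newline='', encoding='utf-8') as f:
#         csv.writer(f).writerow(row)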
def build_model_dt(x_train, y_train):
estimator = DecisionTreeClassifier(random_state=7)
param_grid = {
'max_depth': range(10, 25, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('dt')
print(model.best_params_)
writeParams('dt', model.best_params_)
return model
def build_model_rf(x_train, y_train):
estimator = RandomForestClassifier()
param_grid = {
'max_depth': range(42, 43, 1),
'n_estimators': range(79, 80, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('rf')
print(model.best_params_)
writeParams('rf', model.best_params_)
return model
def build_model_etr(x_train, y_train):
# extra-trees classifier; n_estimators is the number of trees in the ensemble
estimator = ExtraTreesClassifier()
param_grid = {
'max_depth': range(33, 34, 1),
'n_estimators': range(108, 109, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('etr')
print(model.best_params_)
writeParams('etr', model.best_params_)
return model
def build_model_xgb(x_train, y_train):
estimator = XGBClassifier(gamma=0, colsample_bytree=0.9, subsample=0.91)
param_grid = {
'learning_rate': [ 0.27],
'max_depth': range(12, 13, 1),
'n_estimators': range(34, 35, 3),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('xgb')
print(model.best_params_)
writeParams('xgb', model.best_params_)
return model
def build_model_lgb(x_train, y_train):
estimator = LGBMClassifier()
param_grid = {
'learning_rate': [0.18],
'n_estimators': range(100, 101, 1),
'num_leaves': range(75, 80, 5)
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(x_train, y_train.ravel())
print('lgb')
print(gbm.best_params_)
writeParams('lgb', gbm.best_params_)
return gbm
def build_model_mlpr(x_train, y_train):
from sklearn.neural_network import MLPClassifier
'''啟用函式用relu,梯度下降方法用lbfgs,效果是最好的'''
mlp = MLPClassifier(activation='relu', solver='lbfgs')
param_grid = {
'alpha': [0.002, 0.001],
'hidden_layer_sizes': [(38, 19)],
'max_iter': range(75, 85, 1),
}
model = GridSearchCV(mlp, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('mlpr')
print(model.best_params_)
writeParams('mlpr', model.best_params_)
return model
def build_model_ada(x_train, y_train):
estimator = AdaBoostClassifier()
param_grid = {
'learning_rate': [0.23],
'n_estimators': range(13, 14, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('ada')
print(model.best_params_)
writeParams('ada', model.best_params_)
return model
def build_model_gbdt(x_train, y_train):
estimator = GradientBoostingClassifier(min_samples_leaf=0.1, min_samples_split=10, subsample=0.998)
param_grid = {
'learning_rate': [0.75],
'max_depth': range(25, 30, 1),
'n_estimators': range(80, 85, 1)
}
gbdt = GridSearchCV(estimator, param_grid, cv=3)
gbdt.fit(x_train, y_train.ravel())
print('gbdt')
print(gbdt.best_params_)
writeParams('gbdt', gbdt.best_params_)
return gbdt
def build_model_liner_svc(x_train, y_train):
svm_reg = LinearSVC(max_iter=10000)  # use a finite positive cap; -1 ("no limit") is an SVC/libsvm convention, not supported by LinearSVC
param_grid = {
'C': range(1, 2, 1),
}
model = GridSearchCV(svm_reg, param_grid, cv=3)
model.fit(x_train, y_train)
print('LinearSVC')
print(model.best_params_)
return model
def train_logistic_classifier(x_train, y_train):
model = LogisticRegression()
param_grid = {
'C': range(2, 3, 1),
'penalty': ['l2'],
}
model = GridSearchCV(model, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('LR')
print(model.best_params_)
return model
def build_model_svc(x_train, y_train):
model = SVC(max_iter=-1)
param_grid = {
'C': range(1, 2, 2),
'kernel': ['poly', 'rbf'],  # 'precomputed' removed: it requires a precomputed Gram matrix as X
'cache_size': range(200, 210, 20),
}
model = GridSearchCV(model, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('SVC')
print(model.best_params_)
return model
def score_model(test, predict, model, data_type):
accuracy = round(accuracy_score(test, predict), 6)
print(data_type + ',accuracy,', accuracy)
writeOneCsv(['stacking', data_type, 'accuracy', accuracy], src + '調參記錄.csv')
pre_score = precision_score(test, predict, average="macro")
print(data_type + ",precision,", round(pre_score, 6))
writeOneCsv(['stacking', data_type, 'precision', round(pre_score, 6)], src + '調參記錄.csv')
roc_auc = round(roc_auc_score(test, predict), 6)
print(data_type + ",roc_auc,", roc_auc)
writeOneCsv(['stacking', data_type, 'roc_auc', roc_auc], src + '調參記錄.csv')
f1 = f1_score(test, predict)  # sklearn metrics take (y_true, y_pred)
print(data_type + ",f1,", round(f1, 6))
writeOneCsv(['stacking', data_type, 'f1', round(f1, 6)], src + '調參記錄.csv')
recall = recall_score(test, predict)
print(data_type + ",recall,", round(recall, 6))
writeOneCsv(['stacking', data_type, 'recall', round(recall, 6)], src + '調參記錄.csv')
cohen_kappa = cohen_kappa_score(test, predict)
print(data_type + ",cohen_kappa,", round(cohen_kappa, 6))
writeOneCsv(['stacking', data_type, 'cohen_kappa', round(cohen_kappa, 6)], src + '調參記錄.csv')
def save_load(model, save_or_load):
path = src + 'etr.pkl'
# save model
if save_or_load == 'save':
joblib.dump(model, path)
else:
# load model
model_etr = joblib.load(path)
return model_etr
def fit_size(x, y):
from sklearn import preprocessing
x_min = preprocessing.MinMaxScaler()
y_min = preprocessing.MinMaxScaler()
y = np.array(y).reshape(len(y), 1)
x = x_min.fit_transform(x)
y = y_min.fit_transform(y)
return x, y
def scatter_line(y_val, y_pre):
xx = range(0, len(y_val))
plt.scatter(xx, y_val, color="red", label="actual", linewidth=3)
plt.plot(xx, y_pre, color="orange", label="predicted", linewidth=2)
plt.legend()
plt.show()
def draw_ROC_curve(y_test, y_predict):
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_predict)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.legend()
plt.show()
plt.close(0)
def pr(y_val, predict_proba):
precision, recall, thresholds = precision_recall_curve(y_val, predict_proba)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve')
plt.legend()
plt.show()
def writeParams(model, best):
if model in ['gbdt', 'xgb']:
writeOneCsv([model, best['max_depth'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'mlpr':
writeOneCsv([model, best['hidden_layer_sizes'], best['max_iter'], best['alpha']], src + '調參記錄.csv')
elif model == 'ada':
writeOneCsv([model, 0, best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'lgb':
writeOneCsv([model, best['num_leaves'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'dt':
writeOneCsv([model, best['max_depth'], 0, 0], src + '調參記錄.csv')
else:
writeOneCsv([model, best['max_depth'], best['n_estimators'], 0], src + '調參記錄.csv')
def write_mae(model, data_type, mae):
writeOneCsv([model, data_type, 'mae', mae], src + '調參記錄.csv')
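# --- usage sketch (assumption: x_train/y_train and a held-out x_val/y_val are
# prepared as in the model-3 script above; this only ties the helpers together) ---
def _demo(x_train, y_train, x_val, y_val):
    model = build_model_etr(x_train, y_train)                # grid-searched ExtraTreesClassifier
    score_model(y_val, model.predict(x_val), model, 'val')   # print + log accuracy / precision / auc / f1 ...
    save_load(model, 'save')                                 # persist the fitted model to <src>/etr.pkl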