Tianchi Newcomer Competition: O2O Coupon Usage Prediction (Rank 181)
Data
The competition provides users' real online and offline consumption records between 2016-01-01 and 2016-06-30; the task is to predict whether a coupon received in July 2016 will be used within 15 days of receipt.
For details, see the competition page: O2O Coupon Usage Prediction
Approach:
Drop unneeded features
Fill missing values
Compute statistical features
Blend ['gbdt', 'xgb', 'rf_gini', 'et_gini', 'lgb', 'cat'] to make the final prediction (a minimal sketch of the blending step follows)
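The blending step is classic out-of-fold stacking: each base model is trained with StratifiedKFold, its out-of-fold probabilities become one meta-feature column, and a LogisticRegression is fitted on those columns. The snippet below is only a minimal, self-contained sketch of that idea, using synthetic data from make_classification and just two scikit-learn base models for brevity; the full pipeline (the blending function in Model 1) does the same thing with the tuned gbdt/lgb/cat models.

# Minimal out-of-fold blending sketch (synthetic data, two base models only).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

base_models = [GradientBoostingClassifier(), RandomForestClassifier(n_estimators=100)]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Out-of-fold predictions become the meta-features of the second-level model.
blend_train = np.zeros((len(X_train), len(base_models)))
blend_test = np.zeros((len(X_test), len(base_models)))
for j, clf in enumerate(base_models):
    test_fold_preds = []
    for train_idx, val_idx in skf.split(X_train, y_train):
        clf.fit(X_train[train_idx], y_train[train_idx])
        blend_train[val_idx, j] = clf.predict_proba(X_train[val_idx])[:, 1]
        test_fold_preds.append(clf.predict_proba(X_test)[:, 1])
    blend_test[:, j] = np.mean(test_fold_preds, axis=0)

# A simple LogisticRegression stacks the base-model scores.
meta = LogisticRegression().fit(blend_train, y_train)
print('blend auc:', roc_auc_score(y_test, meta.predict_proba(blend_test)[:, 1]))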
Model 1: score 0.8
# 解決lgb報錯
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import datetime
from concurrent.futures import ProcessPoolExecutor
from math import ceil
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
# dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
# dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
# dfon = pd.read_csv('ccf_online_stage1_train.csv')
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
def drop_columns(X, predict=False):
columns = [
'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
# 'u33', 'u34'
]
if predict:
columns.append('Coupon_id')
else:
columns.append('Date')
X.drop(columns=columns, inplace=True)
def get_preprocess_data(predict=False):
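# Load the offline log (test or train) and, for training, the online log; fill missing Distance/Coupon_id/dates and derive a numeric discount_rate from the 'X:Y' full-reduction strings.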
if predict:
offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
else:
offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])
offline.Distance.fillna(11, inplace=True)
offline.Distance = offline.Distance.astype(int)
offline.Coupon_id.fillna(0, inplace=True)
offline.Coupon_id = offline.Coupon_id.astype(int)
offline.Date_received.fillna(date_null, inplace=True)
offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
'Discount_rate'].str.split(':', expand=True).astype(int)
offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)
if predict:
return offline
offline.Date.fillna(date_null, inplace=True)
# online
online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])
online.Coupon_id.fillna(0, inplace=True)
# online.Coupon_id = online.Coupon_id.astype(int)
online.Date_received.fillna(date_null, inplace=True)
online.Date.fillna(date_null, inplace=True)
return offline, online
def task(X_chunk, X, counter):
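# Worker for one chunk of rows: for each coupon receipt, count the user's earlier/later receipts overall (o3/o4) and of the same coupon (o5/o6), and the day gap to the previous/next receipt (o17/o18).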
print(counter, end=',', flush=True)
X_chunk = X_chunk.copy()
X_chunk['o17'] = -1
X_chunk['o18'] = -1
for i, user in X_chunk.iterrows():
temp = X[X.User_id == user.User_id]
temp1 = temp[temp.Date_received < user.Date_received]
temp2 = temp[temp.Date_received > user.Date_received]
# 使用者此次之後/前領取的所有優惠券數目
X_chunk.loc[i, 'o3'] = len(temp1)
X_chunk.loc[i, 'o4'] = len(temp2)
# 使用者此次之後/前領取的特定優惠券數目
X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])
# 使用者上/下一次領取的時間間隔
temp1 = temp1.sort_values(by='Date_received', ascending=False)
if len(temp1):
X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days
temp2 = temp2.sort_values(by='Date_received')
if len(temp2):
X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days
return X_chunk
def get_offline_features(X, offline):
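# Build user (u*), merchant (m*), coupon (c*) and user-merchant (um*) statistics from the offline feature window, plus 'other' (o*) statistics computed on X itself, and merge them onto the label-window rows.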
# X = X[:1000]
print(len(X), len(X.columns))
temp = offline[offline.Coupon_id != 0]
coupon_consume = temp[temp.Date != date_null]
coupon_no_consume = temp[temp.Date == date_null]
user_coupon_consume = coupon_consume.groupby('User_id')
X['weekday'] = X.Date_received.dt.weekday
X['day'] = X.Date_received.dt.day
# # 距離優惠券消費次數
# temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券不消費次數
# temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券領取次數
# X['distance_2'] = X.distance_0 + X.distance_1
#
# # 距離優惠券消費率
# X['distance_3'] = X.distance_0 / X.distance_2
# temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
# temp['d4'] = temp.Distance.sum() / len(temp)
# X = pd.merge(X, temp, how='left', on='Distance')
'''user features'''
# 優惠券消費次數
temp = user_coupon_consume.size().reset_index(name='u2')
X = pd.merge(X, temp, how='left', on='User_id')
# X.u2.fillna(0, inplace=True)
# X.u2 = X.u2.astype(int)
# 優惠券不消費次數
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用優惠券次數與沒使用優惠券次數比值
X['u19'] = X.u2 / X.u3
# 領取優惠券次數
X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
# 優惠券核銷率
X['u4'] = X.u2 / X.u1
# 普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp1 = temp.groupby('User_id').size().reset_index(name='u5')
X = pd.merge(X, temp1, how='left', on='User_id')
# 一共消費多少次
X['u25'] = X.u2 + X.u5
# 使用者使用優惠券消費佔比
X['u20'] = X.u2 / X.u25
# 正常消費平均間隔
temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
# 優惠券消費平均間隔
temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
# 15天內平均會普通消費幾次
X['u8'] = X.u6 / 15
# 15天內平均會優惠券消費幾次
X['u9'] = X.u7 / 15
# 領取優惠券到使用優惠券的平均間隔時間
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
X = pd.merge(X, temp, how='left', on='User_id')
# 在15天內使用掉優惠券的值大小
X['u11'] = X.u10 / 15
# 領取優惠券到使用優惠券間隔小於15天的次數
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = temp[temp.days <= 15]
temp = temp.groupby('User_id').size().reset_index(name='u21')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者15天使用掉優惠券的次數除以使用優惠券的次數
X['u22'] = X.u21 / X.u2
# 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
X['u23'] = X.u21 / X.u3
# 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
X['u24'] = X.u21 / X.u1
# 消費優惠券的平均折率
temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最低消費折率
temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最高消費折率
temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過的不同優惠券數量
temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='u32')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取所有不同優惠券數量
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
X['u33'] = X.u32 / X.u47
# 使用者平均每種優惠券核銷多少張
X['u34'] = X.u2 / X.u47
# 核銷優惠券使用者-商家平均距離
temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
temp = temp.groupby('User_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
temp['u35'] = temp.y / temp.x
temp = temp[['User_id', 'u35']]
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最小使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最大使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
X = pd.merge(X, temp, how='left', on='User_id')
# 優惠券型別
discount_types = [
'0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
'20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
'50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
'200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
'300:10', '200:5', '50:1', '100:1',
]
X['discount_type'] = -1
for k, v in enumerate(discount_types):
X.loc[X.Discount_rate == v, 'discount_type'] = k
# 不同優惠券領取次數
temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券使用次數
temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券不使用次數
temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同打折優惠券使用率
X['u44'] = X.u42 / X.u41
# 滿減型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(':') == True]
temp = temp.groupby('User_id').size().reset_index(name='u48')
X = pd.merge(X, temp, how='left', on='User_id')
# 打折型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
temp = temp.groupby('User_id').size().reset_index(name='u49')
X = pd.merge(X, temp, how='left', on='User_id')
'''offline merchant features'''
# 商戶消費次數
temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷次數
temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶正常消費筆數
X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
# 商家優惠券被領取次數
temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷率
X['m4'] = X.m1 / X.m3
# 商家優惠券被領取後不核銷次數
temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶當天優惠券領取次數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商戶當天優惠券領取人數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商家優惠券核銷的平均消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷的最大消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷的最小消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷不同的使用者數量
temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券領取不同的使用者數量
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
X['m13'] = X.m11 / X.m12
# 商家優惠券平均每個使用者核銷多少張
X['m14'] = X.m1 / X.m12
# 商家被核銷過的不同優惠券數量
temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家領取過的不同優惠券數量的比重
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
X['m19'] = X.m15 / X.m18
# 商家被核銷優惠券的平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('Merchant_id')
X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家平均距離
temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
temp['m21'] = temp.y / temp.x
temp = temp[['Merchant_id', 'm21']]
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最小距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最大距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
X = pd.merge(X, temp, how='left', on='Merchant_id')
"""offline coupon features"""
# 此優惠券一共發行多少張
temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 此優惠券一共被使用多少張
temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 優惠券使用率
X['c3'] = X.c2 / X.c1
# 沒有使用的數目
X['c4'] = X.c1 - X.c2
# 此優惠券在當天發行了多少張
temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
# 優惠券型別(直接優惠為0, 滿減為1)
X['c6'] = 0
X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
# 不同打折優惠券領取次數
temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用次數
temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券不使用次數
temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用率
X['c11'] = X.c9 / X.c8
# 優惠券核銷平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
temp = temp.drop_duplicates('Coupon_id')
X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')
'''user merchant feature'''
# 使用者領取商家的優惠券次數
temp = offline[offline.Coupon_id != 0]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後不核銷次數
temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷次數
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷率
X['um4'] = X.um3 / X.um1
# 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
X = pd.merge(X, temp, how='left', on='User_id')
X['um5'] = X.um2 / X.temp
X.drop(columns='temp', inplace=True)
# 使用者在商店總共消費過幾次
temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者在商店普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者當天在此商店領取的優惠券數目
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
# 使用者領取優惠券不同商家數量
temp = offline[offline.Coupon_id == offline.Coupon_id]
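# Coupon_id was filled with 0 during preprocessing, so this NaN-style filter keeps every offline record (coupon receipts and plain purchases alike).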
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
temp = temp.groupby('User_id').size().reset_index(name='um9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券不同商家數量
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='um10')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
X['um11'] = X.um10 / X.um9
# 使用者平均核銷每個商家多少張優惠券
X['um12'] = X.u2 / X.um9
'''other feature'''
# 使用者領取的所有優惠券數目
temp = X.groupby('User_id').size().reset_index(name='o1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# multiple threads
# data split
stop = len(X)
step = int(ceil(stop / cpu_jobs))
X_chunks = [X[i:i + step] for i in range(0, stop, step)]
X_list = [X] * cpu_jobs
counters = [i for i in range(cpu_jobs)]
start = datetime.datetime.now()
with ProcessPoolExecutor() as e:
X = pd.concat(e.map(task, X_chunks, X_list, counters))
print('time:', str(datetime.datetime.now() - start).split('.')[0])
# multiple threads
# 使用者領取優惠券平均時間間隔
temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
# 使用者領取特定商家的優惠券數目
temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取的不同商家數目
temp = X.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者當天領取的優惠券數目
temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
# 使用者當天領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
# 使用者領取的所有優惠券種類數目
temp = X.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o12')
X = pd.merge(X, temp, how='left', on='User_id')
# 商家被領取的優惠券數目
temp = X.groupby('Merchant_id').size().reset_index(name='o13')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被領取的特定優惠券數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
# 商家被多少不同使用者領取的數目
temp = X.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家發行的所有優惠券種類數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
X = pd.merge(X, temp, how='left', on='Merchant_id')
print(len(X), len(X.columns))
return X
def get_online_features(online, X):
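# Merge user-level online statistics (clicks, purchases, coupon receipts/redemptions, on_u*) and online/offline ratio features onto X.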
# temp = online[online.Coupon_id == online.Coupon_id]
# coupon_consume = temp[temp.Date == temp.Date]
# coupon_no_consume = temp[temp.Date != temp.Date]
# 使用者線上操作次數
temp = online.groupby('User_id').size().reset_index(name='on_u1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選次數
temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選率
X['on_u3'] = X.on_u2 / X.on_u1
# 使用者線上購買次數
temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上購買率
X['on_u5'] = X.on_u4 / X.on_u1
# 使用者線上領取次數
temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上領取率
X['on_u7'] = X.on_u6 / X.on_u1
# 使用者線上不消費次數
temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u8')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷次數
temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷率
X['on_u10'] = X.on_u9 / X.on_u6
# 使用者線下不消費次數佔線上線下總的不消費次數的比重
X['on_u11'] = X.u3 / (X.on_u8 + X.u3)
# 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
X['on_u12'] = X.u2 / (X.on_u9 + X.u2)
# 使用者線下領取的記錄數量佔總的記錄數量的比重
X['on_u13'] = X.u1 / (X.on_u6 + X.u1)
# # 消費優惠券的平均折率
# temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最低消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最高消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 不同打折優惠券領取次數
# temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用次數
# temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券不使用次數
# temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用率
# X['oc4'] = X.oc2 / X.oc1
print(len(X), len(X.columns))
print('----------')
return X
def get_train_data():
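# Build two labelled slices (receipts 2016-04-16~05-15 and 2016-05-16~06-15; label = redeemed within 15 days), extract features from the preceding windows, concatenate, drop ID columns, fill NaN with 0 and cache the result.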
path = 'cache_%s_train.csv' % os.path.basename(__file__)
if os.path.exists(path):
data = pd.read_csv(path)
else:
offline, online = get_preprocess_data()
# date received 2016-01-01 - 2016-06-15
# date consumed 2016-01-01 - 2016-06-30
# train data 1
# 2016-04-16 ~ 2016-05-15
data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
data_1['label'] = 0
data_1.loc[
(data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 1
# 領券 2016-01-01 ~ 2016-03-31
end = '2016-03-31'
data_off_1 = offline[offline.Date_received <= end]
data_on_1 = online[online.Date_received <= end]
# 普通消費 2016-01-01 ~ 2016-04-15
end = '2016-04-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]
data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)
# train data 2
# 2016-05-16 ~ 2016-06-15
data_2 = offline['2016-05-16' <= offline.Date_received].copy()
data_2['label'] = 0
data_2.loc[
(data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 2
# 領券
start = '2016-02-01'
end = '2016-04-30'
data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]
# 普通消費
start = '2016-02-01'
end = '2016-05-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]
data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)
data = pd.concat([data_1, data_2])
# undersampling
# if undersampling:
# temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
# temp = X_1[X_1.User_id.isin(temp.User_id)]
# X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])
# data.drop_duplicates(inplace=True)
drop_columns(data)
data.fillna(0, inplace=True)
data.to_csv(path, index=False)
return data
def analysis():
offline, online = get_preprocess_data()
# t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Discount_rate')
# t['consume_rate'] = t.consume_count / t.receive_count
# t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Merchant_id')
# t['consume_rate'] = t.consume_count / t.receive_count
t = offline.groupby('Distance').size().reset_index(name='receive_count')
t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
t = pd.merge(t, t1, on='Distance')
t['consume_rate'] = t.consume_count / t.receive_count
t.to_csv('note.csv')
# plt.bar(temp.Discount_rate.values, temp.total.values)
# plt.bar(range(num), y1, bottom=y2, fc='r')
# plt.show()
exit()
def detect_duplicate_columns():
X = get_train_data()
X = X[:1000]
for index1 in range(len(X.columns) - 1):
for index2 in range(index1 + 1, len(X.columns)):
column1 = X.columns[index1]
column2 = X.columns[index2]
X[column1] = X[column1].astype(str)
X[column2] = X[column2].astype(str)
temp = len(X[X[column1] == X[column2]])
if temp == len(X):
print(column1, column2, temp)
exit()
def feature_importance_score():
clf = train_xgb()
fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
fscores.plot(kind='bar', title='Feature Importance')
plt.ylabel('Feature Importance Score')
plt.show()
exit()
def feature_selection():
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
X = train_data.copy().drop(columns='Coupon_id')
y = X.pop('label')
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# X = sel.fit_transform(X)
# print(X.shape)
# Create the RFE object and rank each pixel
def fit_eval_metric(estimator, X, y, name=None):
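# XGBoost and LightGBM are fitted with eval_metric='auc'; every other estimator uses a plain fit.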
if name is None:
name = estimator.__class__.__name__
if name in ('XGBClassifier', 'LGBMClassifier'):
estimator.fit(X, y, eval_metric='auc')
else:
estimator.fit(X, y)
return estimator
def grid_search(estimator, param_grid):
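# GridSearchCV with scoring='roc_auc' on a 100k-row sample; prints each candidate's mean/std score and returns the best parameters and score.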
start = datetime.datetime.now()
print('--------------------------------------------')
print(start.strftime('%Y-%m-%d %H:%M:%S'))
print(param_grid)
print()
data = get_train_data()
data, _ = train_test_split(data, train_size=100000, random_state=0)
X = data.copy().drop(columns='Coupon_id')
y = X.pop('label')
estimator_name = estimator.__class__.__name__
n_jobs = cpu_jobs
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
# cv=5
)
clf = fit_eval_metric(clf, X, y, estimator_name)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
print()
print('best params', clf.best_params_)
print('best score', clf.best_score_)
print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
print()
return clf.best_params_, clf.best_score_
def grid_search_auto(steps, params, estimator):
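# Coordinate-style tuning: for each parameter, grid-search a small window around the current value and keep stepping in the improving direction until the score stops improving or a bound is hit.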
global log
old_params = params.copy()
while 1:
for name, step in steps.items():
score = 0
start = params[name] - step['step']
if start <= step['min']:
start = step['min']
stop = params[name] + step['step']
if step['max'] != 'inf' and stop >= step['max']:
stop = step['max']
while 1:
if str(step['step']).count('.') == 1:
stop += step['step'] / 10
else:
stop += step['step']
param_grid = {
name: np.arange(start, stop, step['step']),
}
best_params, best_score = grid_search(estimator.set_params(**params), param_grid)
if best_params[name] == params[name] or score > best_score:
print(estimator.__class__.__name__, params)
break
direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
start = stop = best_params[name] + step['step'] * direction
score = best_score
params[name] = best_params[name]
print(estimator.__class__.__name__, params)
if best_params[name] - step['step'] < step['min'] or (
step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
break
if old_params == params:
break
old_params = params
print('--------------------------------------------')
print('new grid search')
print('--------------------------------------------')
log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)
def grid_search_gbdt(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1900,
'max_depth': 9,
'min_samples_split': 200,
'min_samples_leaf': 50,
'subsample': .8,
# 'learning_rate': 1e-1,
# 'n_estimators': 200,
# 'max_depth': 8,
# 'min_samples_split': 200,
# 'min_samples_leaf': 50,
# 'subsample': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, GradientBoostingClassifier())
def grid_search_xgb(get_param=False):
params = {
'learning_rate': 1e-2,
'n_estimators': 1260,
'max_depth': 8,
'min_child_weight': 4,
'gamma': .2,
'subsample': .6,
'colsample_bytree': .8,
'scale_pos_weight': 1,
'reg_alpha': 0,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
'gamma': {'step': .1, 'min': 0, 'max': 1},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
}
grid_search_auto(steps, params, XGBClassifier())
def grid_search_lgb(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1200,
'num_leaves': 51,
'min_split_gain': 0,
'min_child_weight': 1e-3,
'min_child_samples': 22,
'subsample': .8,
'colsample_bytree': .8,
# 'learning_rate': .1,
# 'n_estimators': 90,
# 'num_leaves': 50,
# 'min_split_gain': 0,
# 'min_child_weight': 1e-3,
# 'min_child_samples': 21,
# 'subsample': .8,
# 'colsample_bytree': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
# 'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, LGBMClassifier())
def grid_search_cat(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 3600,
'max_depth': 8,
'max_bin': 127,
'reg_lambda': 2,
'subsample': .7,
# 'learning_rate': 1e-1,
# 'iterations': 460,
# 'depth': 8,
# 'l2_leaf_reg': 8,
# 'border_count': 37,
# 'ctr_border_count': 16,
'one_hot_max_size': 2,
'bootstrap_type': 'Bernoulli',
'leaf_estimation_method': 'Newton',
'verbose': False,
'eval_metric': 'AUC',
'thread_count': cpu_jobs
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'max_bin': {'step': 1, 'min': 1, 'max': 255},
'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
}
grid_search_auto(steps, params, CatBoostClassifier())
def grid_search_rf(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3090,
'max_depth': 15,
'min_samples_split': 2,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3110,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, RandomForestClassifier())
def grid_search_et(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3060,
'max_depth': 22,
'min_samples_split': 12,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, ExtraTreesClassifier())
def train_gbdt(model=False):
global log
params = grid_search_gbdt(True)
clf = GradientBoostingClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'gbdt'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', subsample: %.1f' % params['subsample']
log += '\n\n'
return train(clf)
def train_xgb(model=False):
global log
params = grid_search_xgb(True)
clf = XGBClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'xgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_child_weight: %d' % params['min_child_weight']
log += ', gamma: %.1f' % params['gamma']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_lgb(model=False):
global log
params = grid_search_lgb(True)
clf = LGBMClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'lgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', num_leaves: %d' % params['num_leaves']
log += ', min_split_gain: %.1f' % params['min_split_gain']
log += ', min_child_weight: %.4f' % params['min_child_weight']
log += ', min_child_samples: %d' % params['min_child_samples']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_cat(model=False):
global log
params = grid_search_cat(True)
clf = CatBoostClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'cat'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', iterations: %d' % params['iterations']
log += ', depth: %d' % params['depth']
log += ', l2_leaf_reg: %d' % params['l2_leaf_reg']
log += ', border_count: %d' % params['border_count']
log += ', subsample: %d' % params['subsample']
log += ', one_hot_max_size: %d' % params['one_hot_max_size']
log += '\n\n'
return train(clf)
def train_rf(clf):
global log
params = clf.get_params()
log += 'rf'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_rf_gini(model=False):
clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
if model:
return clf
return train_rf(clf)
def train_rf_entropy():
clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))
return train_rf(clf)
def train_et(clf):
global log
params = clf.get_params()
log += 'et'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_et_gini(model=False):
clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
if model:
return clf
return train_et(clf)
def train_et_entropy():
clf = ExtraTreesClassifier().set_params(**{
'n_estimators': 310,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
})
return train_et(clf)
def train(clf):
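# Fit on a 100k-row training split, then report accuracy, overall AUC and the mean per-Coupon_id AUC (the competition metric) on the held-out split.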
global log
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
_, test_data = train_test_split(data, random_state=0)
X_train = train_data.copy().drop(columns='Coupon_id')
y_train = X_train.pop('label')
clf = fit_eval_metric(clf, X_train, y_train)
X_test = test_data.copy().drop(columns='Coupon_id')
y_test = X_test.pop('label')
y_true, y_pred = y_test, clf.predict(X_test)
# log += '%s\n' % classification_report(y_test, y_pred)
log += ' accuracy: %f\n' % accuracy_score(y_true, y_pred)
y_score = clf.predict_proba(X_test)[:, 1]
log += ' auc: %f\n' % roc_auc_score(y_true, y_score)
# coupon average auc
coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
aucs = []
for _, coupon in coupons.iterrows():
if coupon.total > 1:
X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
X_test.drop(columns='Coupon_id', inplace=True)
if len(X_test.label.unique()) != 2:
continue
y_true = X_test.pop('label')
y_score = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_true, y_score))
log += 'coupon auc: %f\n\n' % np.mean(aucs)
return clf
def predict(model):
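# Build features for the July test set (feature window from 2016-03-16 onward), score it with the chosen single model or the blending ensemble, and write submission_<model>.csv.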
path = 'cache_%s_predict.csv' % os.path.basename(__file__)
if os.path.exists(path):
X = pd.read_csv(path, parse_dates=['Date_received'])
else:
offline, online = get_preprocess_data()
# 2016-03-16 ~ 2016-06-30
start = '2016-03-16'
offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]
X = get_preprocess_data(True)
X = get_offline_features(X, offline)
X = get_online_features(online, X)
X.drop_duplicates(inplace=True)
X.fillna(0, inplace=True)
X.to_csv(path, index=False)
sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
drop_columns(X, True)
if model == 'blending':
predict = blending(X)
else:
clf = eval('train_%s' % model)()
predict = clf.predict_proba(X)[:, 1]
sample_submission['Probability'] = predict
sample_submission.to_csv('submission_%s.csv' % model,
# float_format='%.5f',
index=False, header=False)
def blending(predict_X=None):
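# Out-of-fold stacking: each base model's out-of-fold probabilities become a meta-feature column, a LogisticRegression is fitted on them, and the blended predictions are min-max scaled to [0, 1].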
global log
log += '\n'
X = get_train_data().drop(columns='Coupon_id')
y = X.pop('label')
X = np.asarray(X)
y = np.asarray(y)
_, X_submission, _, y_test_blend = train_test_split(X, y,
random_state=0
)
if predict_X is not None:
X_submission = np.asarray(predict_X)
X, _, y, _ = train_test_split(X, y,
train_size=100000,
random_state=0
)
# np.random.seed(0)
# idx = np.random.permutation(y.size)
# X = X[idx]
# y = y[idx]
skf = StratifiedKFold()
# clfs = ['gbdt', 'xgb', 'lgb', 'cat',
# # 'rf_gini', 'et_gini'
# ]
clfs = ['gbdt', 'cat', 'lgb']
blend_X_train = np.zeros((X.shape[0], len(clfs)))
blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))
for j, v in enumerate(clfs):
clf = eval('train_%s' % v)(True)
aucs = []
dataset_blend_test_j = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf = fit_eval_metric(clf, X_train, y_train)
y_submission = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_test, y_submission))
blend_X_train[test_index, j] = y_submission
dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])
log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)
print('blending')
clf = LogisticRegression()
# clf = GradientBoostingClassifier()
clf.fit(blend_X_train, y)
y_submission = clf.predict_proba(blend_X_test)[:, 1]
# Linear stretch of predictions to [0,1]
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
if predict_X is not None:
return y_submission
log += '\n blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)
if __name__ == '__main__':
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
predict('blending')
log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
log += '----------------------------------------------------\n'
open('%s.log' % os.path.basename(__file__), 'a').write(log)
print(log)
Model 2: score 0.79
# 解決lgb報錯
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import datetime
from concurrent.futures import ProcessPoolExecutor
from math import ceil
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
def drop_columns(X, predict=False):
columns = [
'User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
# 'u33', 'u34'
]
if predict:
columns.append('Coupon_id')
else:
columns.append('Date')
X.drop(columns=columns, inplace=True)
def get_preprocess_data(predict=False):
if predict:
offline = pd.read_csv('ccf_offline_stage1_test_revised.csv', parse_dates=['Date_received'])
else:
offline = pd.read_csv('ccf_offline_stage1_train.csv', parse_dates=['Date_received', 'Date'])
offline.Distance.fillna(11, inplace=True)
offline.Distance = offline.Distance.astype(int)
offline.Coupon_id.fillna(0, inplace=True)
offline.Coupon_id = offline.Coupon_id.astype(int)
offline.Date_received.fillna(date_null, inplace=True)
offline[['discount_rate_x', 'discount_rate_y']] = offline[offline.Discount_rate.str.contains(':') == True][
'Discount_rate'].str.split(':', expand=True).astype(int)
offline['discount_rate'] = 1 - offline.discount_rate_y / offline.discount_rate_x
offline.discount_rate = offline.discount_rate.fillna(offline.Discount_rate).astype(float)
if predict:
return offline
offline.Date.fillna(date_null, inplace=True)
# online
online = pd.read_csv('ccf_online_stage1_train.csv', parse_dates=['Date_received', 'Date'])
online.Coupon_id.fillna(0, inplace=True)
# online.Coupon_id = online.Coupon_id.astype(int)
online.Date_received.fillna(date_null, inplace=True)
online.Date.fillna(date_null, inplace=True)
return offline, online
def task(X_chunk, X, counter):
print(counter, end=',', flush=True)
X_chunk = X_chunk.copy()
X_chunk['o17'] = -1
X_chunk['o18'] = -1
for i, user in X_chunk.iterrows():
temp = X[X.User_id == user.User_id]
temp1 = temp[temp.Date_received < user.Date_received]
temp2 = temp[temp.Date_received > user.Date_received]
# 使用者此次之後/前領取的所有優惠券數目
X_chunk.loc[i, 'o3'] = len(temp1)
X_chunk.loc[i, 'o4'] = len(temp2)
# 使用者此次之後/前領取的特定優惠券數目
X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])
# 使用者上/下一次領取的時間間隔
temp1 = temp1.sort_values(by='Date_received', ascending=False)
if len(temp1):
X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days
temp2 = temp2.sort_values(by='Date_received')
if len(temp2):
X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days
return X_chunk
def get_offline_features(X, offline):
# X = X[:1000]
print(len(X), len(X.columns))
temp = offline[offline.Coupon_id != 0]
coupon_consume = temp[temp.Date != date_null]
coupon_no_consume = temp[temp.Date == date_null]
user_coupon_consume = coupon_consume.groupby('User_id')
X['weekday'] = X.Date_received.dt.weekday
X['day'] = X.Date_received.dt.day
# # 距離優惠券消費次數
# temp = coupon_consume.groupby('Distance').size().reset_index(name='distance_0')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券不消費次數
# temp = coupon_no_consume.groupby('Distance').size().reset_index(name='distance_1')
# X = pd.merge(X, temp, how='left', on='Distance')
#
# # 距離優惠券領取次數
# X['distance_2'] = X.distance_0 + X.distance_1
#
# # 距離優惠券消費率
# X['distance_3'] = X.distance_0 / X.distance_2
# temp = coupon_consume[coupon_consume.Distance != 11].groupby('Distance').size()
# temp['d4'] = temp.Distance.sum() / len(temp)
# X = pd.merge(X, temp, how='left', on='Distance')
'''user features'''
# 優惠券消費次數
temp = user_coupon_consume.size().reset_index(name='u2')
X = pd.merge(X, temp, how='left', on='User_id')
# X.u2.fillna(0, inplace=True)
# X.u2 = X.u2.astype(int)
# 優惠券不消費次數
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用優惠券次數與沒使用優惠券次數比值
X['u19'] = X.u2 / X.u3
# 領取優惠券次數
X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
# 優惠券核銷率
X['u4'] = X.u2 / X.u1
# 普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp1 = temp.groupby('User_id').size().reset_index(name='u5')
X = pd.merge(X, temp1, how='left', on='User_id')
# 一共消費多少次
X['u25'] = X.u2 + X.u5
# 使用者使用優惠券消費佔比
X['u20'] = X.u2 / X.u25
# 正常消費平均間隔
temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
# 優惠券消費平均間隔
temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
# 15天內平均會普通消費幾次
X['u8'] = X.u6 / 15
# 15天內平均會優惠券消費幾次
X['u9'] = X.u7 / 15
# 領取優惠券到使用優惠券的平均間隔時間
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
X = pd.merge(X, temp, how='left', on='User_id')
# 在15天內使用掉優惠券的值大小
X['u11'] = X.u10 / 15
# 領取優惠券到使用優惠券間隔小於15天的次數
temp = coupon_consume.copy()
temp['days'] = (temp.Date - temp.Date_received).dt.days
temp = temp[temp.days <= 15]
temp = temp.groupby('User_id').size().reset_index(name='u21')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者15天使用掉優惠券的次數除以使用優惠券的次數
X['u22'] = X.u21 / X.u2
# 使用者15天使用掉優惠券的次數除以領取優惠券未消費的次數
X['u23'] = X.u21 / X.u3
# 使用者15天使用掉優惠券的次數除以領取優惠券的總次數
X['u24'] = X.u21 / X.u1
# 消費優惠券的平均折率
temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最低消費折率
temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券的最高消費折率
temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過的不同優惠券數量
temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='u32')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取所有不同優惠券數量
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# 使用者核銷過的不同優惠券數量佔所有不同優惠券的比重
X['u33'] = X.u32 / X.u47
# 使用者平均每種優惠券核銷多少張
X['u34'] = X.u2 / X.u47
# 核銷優惠券使用者-商家平均距離
temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
temp = temp.groupby('User_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
temp['u35'] = temp.y / temp.x
temp = temp[['User_id', 'u35']]
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最小使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券中的最大使用者-商家距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
X = pd.merge(X, temp, how='left', on='User_id')
# 優惠券型別
discount_types = [
'0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
'20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
'50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
'200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
'300:10', '200:5', '50:1', '100:1',
]
X['discount_type'] = -1
for k, v in enumerate(discount_types):
X.loc[X.Discount_rate == v, 'discount_type'] = k
# 不同優惠券領取次數
temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券使用次數
temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同優惠券不使用次數
temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
# 不同打折優惠券使用率
X['u44'] = X.u42 / X.u41
# 滿減型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(':') == True]
temp = temp.groupby('User_id').size().reset_index(name='u48')
X = pd.merge(X, temp, how='left', on='User_id')
# 打折型別優惠券領取次數
temp = offline[offline.Discount_rate.str.contains(r'\.') == True]
temp = temp.groupby('User_id').size().reset_index(name='u49')
X = pd.merge(X, temp, how='left', on='User_id')
'''offline merchant features'''
# 商戶消費次數
temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷次數
temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶正常消費筆數
X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
# 商家優惠券被領取次數
temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券被領取後核銷率
X['m4'] = X.m1 / X.m3
# 商家優惠券被領取後不核銷次數
temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商戶當天優惠券領取次數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商戶當天優惠券領取人數
temp = X[X.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
# 商家優惠券核銷的平均消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷的最大消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷的最小消費折率
temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券核銷不同的使用者數量
temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家優惠券領取不同的使用者數量
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 核銷商家優惠券的不同使用者數量其佔領取不同的使用者比重
X['m13'] = X.m11 / X.m12
# 商家優惠券平均每個使用者核銷多少張
X['m14'] = X.m1 / X.m12
# 商家被核銷過的不同優惠券數量
temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家領取過的不同優惠券數量的比重
temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
X['m19'] = X.m15 / X.m18
# 商家被核銷優惠券的平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('Merchant_id')
X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家平均距離
temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
temp['m21'] = temp.y / temp.x
temp = temp[['Merchant_id', 'm21']]
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最小距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被核銷優惠券中的使用者-商家最大距離
temp = coupon_consume[coupon_consume.Distance != 11]
temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
X = pd.merge(X, temp, how='left', on='Merchant_id')
"""offline coupon features"""
# 此優惠券一共發行多少張
temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 此優惠券一共被使用多少張
temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
X = pd.merge(X, temp, how='left', on='Coupon_id')
# 優惠券使用率
X['c3'] = X.c2 / X.c1
# 沒有使用的數目
X['c4'] = X.c1 - X.c2
# 此優惠券在當天發行了多少張
temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
# 優惠券型別(直接優惠為0, 滿減為1)
X['c6'] = 0
X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
# 不同打折優惠券領取次數
temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用次數
temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券不使用次數
temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
X = pd.merge(X, temp, how='left', on='Discount_rate')
# 不同打折優惠券使用率
X['c11'] = X.c9 / X.c8
# 優惠券核銷平均時間
temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
temp = temp.drop_duplicates('Coupon_id')
X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')
'''user merchant feature'''
# 使用者領取商家的優惠券次數
temp = offline[offline.Coupon_id != 0]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後不核銷次數
temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷次數
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取商家的優惠券後核銷率
X['um4'] = X.um3 / X.um1
# 使用者對每個商家的不核銷次數佔使用者總的不核銷次數的比重
temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
X = pd.merge(X, temp, how='left', on='User_id')
X['um5'] = X.um2 / X.temp
X.drop(columns='temp', inplace=True)
# 使用者在商店總共消費過幾次
temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者在商店普通消費次數
temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者當天在此商店領取的優惠券數目
temp = offline[offline.Date_received != date_null]
temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
# number of distinct merchants from which the user has received coupons
# Coupon_id was filled with 0 earlier, so the NaN self-comparison no longer filters anything; use != 0
temp = offline[offline.Coupon_id != 0]
temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
temp = temp.groupby('User_id').size().reset_index(name='um9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷優惠券不同商家數量
temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='um10')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者核銷過優惠券的不同商家數量佔所有不同商家的比重
X['um11'] = X.um10 / X.um9
# 使用者平均核銷每個商家多少張優惠券
X['um12'] = X.u2 / X.um9
'''other feature'''
# 使用者領取的所有優惠券數目
temp = X.groupby('User_id').size().reset_index(name='o1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
# multiple threads
# data split
stop = len(X)
step = int(ceil(stop / cpu_jobs))
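# each worker gets a chunk of about len(X) / cpu_jobs rows (ceil so no trailing rows are lost)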
X_chunks = [X[i:i + step] for i in range(0, stop, step)]
X_list = [X] * cpu_jobs
counters = [i for i in range(cpu_jobs)]
start = datetime.datetime.now()
with ProcessPoolExecutor() as e:
X = pd.concat(e.map(task, X_chunks, X_list, counters))
print('time:', str(datetime.datetime.now() - start).split('.')[0])
# multiple threads
# 使用者領取優惠券平均時間間隔
temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
temp = temp.drop_duplicates('User_id')
X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
# 使用者領取特定商家的優惠券數目
temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
# 使用者領取的不同商家數目
temp = X.groupby(['User_id', 'Merchant_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者當天領取的優惠券數目
temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
# 使用者當天領取的特定優惠券數目
temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
# 使用者領取的所有優惠券種類數目
temp = X.groupby(['User_id', 'Coupon_id']).size()
temp = temp.groupby('User_id').size().reset_index(name='o12')
X = pd.merge(X, temp, how='left', on='User_id')
# 商家被領取的優惠券數目
temp = X.groupby('Merchant_id').size().reset_index(name='o13')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家被領取的特定優惠券數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
# 商家被多少不同使用者領取的數目
temp = X.groupby(['Merchant_id', 'User_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
X = pd.merge(X, temp, how='left', on='Merchant_id')
# 商家發行的所有優惠券種類數目
temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
X = pd.merge(X, temp, how='left', on='Merchant_id')
print(len(X), len(X.columns))
return X
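Almost every statistical feature above is built with the same count-then-merge pattern: group the history records, aggregate with size()/mean()/min()/max(), name the result via reset_index(name=...), and left-join it back onto the samples. A minimal, self-contained sketch of that pattern on toy data (the column names f_cnt and f_rate are made up for illustration):
import pandas as pd

history = pd.DataFrame({
    'User_id': [1, 1, 2, 2, 2],
    'consumed': [1, 0, 1, 1, 0],
})
samples = pd.DataFrame({'User_id': [1, 2, 3]})

# how many history records each user has
temp = history.groupby('User_id').size().reset_index(name='f_cnt')
samples = pd.merge(samples, temp, how='left', on='User_id')

# each user's consumption rate
temp = history.groupby('User_id').consumed.mean().reset_index(name='f_rate')
samples = pd.merge(samples, temp, how='left', on='User_id')

print(samples)  # user 3 has no history, so both features are NaN (later filled with 0)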
def get_online_features(online, X):
# temp = online[online.Coupon_id == online.Coupon_id]
# coupon_consume = temp[temp.Date == temp.Date]
# coupon_no_consume = temp[temp.Date != temp.Date]
# 使用者線上操作次數
temp = online.groupby('User_id').size().reset_index(name='on_u1')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選次數
temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上點選率
X['on_u3'] = X.on_u2 / X.on_u1
# 使用者線上購買次數
temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上購買率
X['on_u5'] = X.on_u4 / X.on_u1
# 使用者線上領取次數
temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上領取率
X['on_u7'] = X.on_u6 / X.on_u1
# 使用者線上不消費次數
temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u8')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷次數
temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
temp = temp.groupby('User_id').size().reset_index(name='on_u9')
X = pd.merge(X, temp, how='left', on='User_id')
# 使用者線上優惠券核銷率
X['on_u10'] = X.on_u9 / X.on_u6
# 使用者線下不消費次數佔線上線下總的不消費次數的比重
X['on_u11'] = X.u3 / (X.on_u8 + X.u3)
# 使用者線下的優惠券核銷次數佔線上線下總的優惠券核銷次數的比重
X['on_u12'] = X.u2 / (X.on_u9 + X.u2)
# 使用者線下領取的記錄數量佔總的記錄數量的比重
X['on_u13'] = X.u1 / (X.on_u6 + X.u1)
# # 消費優惠券的平均折率
# temp = coupon_consume.groupby('User_id').discount_rate.mean().reset_index(name='ou14')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最低消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.min().reset_index(name='ou15')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 使用者核銷優惠券的最高消費折率
# temp = coupon_consume.groupby('User_id').discount_rate.max().reset_index(name='ou16')
# X = pd.merge(X, temp, how='left', on='User_id')
#
# # 不同打折優惠券領取次數
# temp = online.groupby('Discount_rate').size().reset_index(name='oc1')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用次數
# temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='oc2')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券不使用次數
# temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='oc3')
# X = pd.merge(X, temp, how='left', on='Discount_rate')
#
# # 不同打折優惠券使用率
# X['oc4'] = X.oc2 / X.oc1
print(len(X), len(X.columns))
print('----------')
return X
def get_train_data():
path = 'cache_%s_train.csv' % os.path.basename(__file__)
if os.path.exists(path):
data = pd.read_csv(path)
else:
offline, online = get_preprocess_data()
# date received 2016-01-01 - 2016-06-15
# date consumed 2016-01-01 - 2016-06-30
# train data 1
# 2016-04-16 ~ 2016-05-15
data_1 = offline[('2016-04-16' <= offline.Date_received) & (offline.Date_received <= '2016-05-15')].copy()
data_1['label'] = 0
data_1.loc[
(data_1.Date != date_null) & (data_1.Date - data_1.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 1
# 領券 2016-01-01 ~ 2016-03-31
end = '2016-03-31'
data_off_1 = offline[offline.Date_received <= end]
data_on_1 = online[online.Date_received <= end]
# 普通消費 2016-01-01 ~ 2016-04-15
end = '2016-04-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (online.Date <= end)]
data_1 = get_offline_features(data_1, pd.concat([data_off_1, data_off_2]))
data_1 = get_online_features(pd.concat([data_on_1, data_on_2]), data_1)
# train data 2
# 2016-05-16 ~ 2016-06-15
data_2 = offline[('2016-05-16' <= offline.Date_received) & (offline.Date_received <= '2016-06-15')].copy()
data_2['label'] = 0
data_2.loc[
(data_2.Date != date_null) & (data_2.Date - data_2.Date_received <= datetime.timedelta(15)), 'label'] = 1
# feature data 2
# 領券
start = '2016-02-01'
end = '2016-04-30'
data_off_1 = offline[(start <= offline.Date_received) & (offline.Date_received <= end)]
data_on_1 = online[(start <= online.Date_received) & (online.Date_received <= end)]
# 普通消費
start = '2016-02-01'
end = '2016-05-15'
data_off_2 = offline[(offline.Coupon_id == 0) & (start <= offline.Date) & (offline.Date <= end)]
data_on_2 = online[(online.Coupon_id == 0) & (start <= online.Date) & (online.Date <= end)]
data_2 = get_offline_features(data_2, pd.concat([data_off_1, data_off_2]))
data_2 = get_online_features(pd.concat([data_on_1, data_on_2]), data_2)
data = pd.concat([data_1, data_2])
# undersampling
# if undersampling:
# temp = X_1[X_1.label == 1].groupby('User_id').size().reset_index()
# temp = X_1[X_1.User_id.isin(temp.User_id)]
# X_1 = pd.concat([temp, X_1[~X_1.User_id.isin(temp.User_id)].sample(4041)])
# data.drop_duplicates(inplace=True)
drop_columns(data)
data.fillna(0, inplace=True)
data.to_csv(path, index=False)
return data
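The two training slices follow a sliding-window scheme: labels come from one month of coupon receipts, while features are computed only from records that end before that month starts, so the feature window never leaks information from the label window. Restating the windows used above:
windows = [
    {'label':    ('2016-04-16', '2016-05-15'),
     'received': ('2016-01-01', '2016-03-31'),   # coupon-receipt feature window
     'consumed': ('2016-01-01', '2016-04-15')},  # plain-consumption feature window
    {'label':    ('2016-05-16', '2016-06-15'),
     'received': ('2016-02-01', '2016-04-30'),
     'consumed': ('2016-02-01', '2016-05-15')},
]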
def analysis():
offline, online = get_preprocess_data()
# t = offline.groupby('Discount_rate').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Discount_rate').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Discount_rate')
# t['consume_rate'] = t.consume_count / t.receive_count
# t = offline.groupby('Merchant_id').size().reset_index(name='receive_count')
# t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
# t1 = t1.groupby('Merchant_id').size().reset_index(name='consume_count')
# t = pd.merge(t, t1, on='Merchant_id')
# t['consume_rate'] = t.consume_count / t.receive_count
t = offline.groupby('Distance').size().reset_index(name='receive_count')
t1 = offline[(offline.Coupon_id != 0) & (offline.Date != date_null)]
t1 = t1.groupby('Distance').size().reset_index(name='consume_count')
t = pd.merge(t, t1, on='Distance')
t['consume_rate'] = t.consume_count / t.receive_count
t.to_csv('note.csv')
# plt.bar(temp.Discount_rate.values, temp.total.values)
# plt.bar(range(num), y1, bottom=y2, fc='r')
# plt.show()
exit()
def detect_duplicate_columns():
X = get_train_data()
X = X[:1000]
for index1 in range(len(X.columns) - 1):
for index2 in range(index1 + 1, len(X.columns)):
column1 = X.columns[index1]
column2 = X.columns[index2]
X[column1] = X[column1].astype(str)
X[column2] = X[column2].astype(str)
temp = len(X[X[column1] == X[column2]])
if temp == len(X):
print(column1, column2, temp)
exit()
def feature_importance_score():
clf = train_xgb()
fscores = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
fscores.plot(kind='bar', title='Feature Importance')
plt.ylabel('Feature Importance Score')
plt.show()
exit()
def feature_selection():
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
X = train_data.copy().drop(columns='Coupon_id')
y = X.pop('label')
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# X = sel.fit_transform(X)
# print(X.shape)
# Create the RFE object and rank each pixel
def fit_eval_metric(estimator, X, y, name=None):
if name is None:
name = estimator.__class__.__name__
if name in ('XGBClassifier', 'LGBMClassifier'):  # compare strings with ==/in, not 'is'
estimator.fit(X, y, eval_metric='auc')
else:
estimator.fit(X, y)
return estimator
def grid_search(estimator, param_grid):
start = datetime.datetime.now()
print('--------------------------------------------')
print(start.strftime('%Y-%m-%d %H:%M:%S'))
print(param_grid)
print()
data = get_train_data()
data, _ = train_test_split(data, train_size=100000, random_state=0)
X = data.copy().drop(columns='Coupon_id')
y = X.pop('label')
estimator_name = estimator.__class__.__name__
n_jobs = cpu_jobs
if estimator_name in ('XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'):
n_jobs = 1
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs
# cv=5
)
clf = fit_eval_metric(clf, X, y, estimator_name)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
print('%0.5f (+/-%0.05f) for %r' % (mean, std * 2, params))
print()
print('best params', clf.best_params_)
print('best score', clf.best_score_)
print('time: %s' % str((datetime.datetime.now() - start)).split('.')[0])
print()
return clf.best_params_, clf.best_score_
def grid_search_auto(steps, params, estimator):
global log
old_params = params.copy()
while 1:
for name, step in steps.items():
score = 0
start = params[name] - step['step']
if start <= step['min']:
start = step['min']
stop = params[name] + step['step']
if step['max'] != 'inf' and stop >= step['max']:
stop = step['max']
while 1:
if str(step['step']).count('.') == 1:
stop += step['step'] / 10
else:
stop += step['step']
param_grid = {
name: np.arange(start, stop, step['step']),
}
best_params, best_score = grid_search(estimator.set_params(**params), param_grid)
if best_params[name] == params[name] or score > best_score:
print(estimator.__class__.__name__, params)
break
direction = (best_params[name] - params[name]) // abs(best_params[name] - params[name])
start = stop = best_params[name] + step['step'] * direction
score = best_score
params[name] = best_params[name]
print(estimator.__class__.__name__, params)
if best_params[name] - step['step'] < step['min'] or (
step['max'] != 'inf' and best_params[name] + step['step'] > step['max']):
break
if old_params == params:
break
old_params = params
print('--------------------------------------------')
print('new grid search')
print('--------------------------------------------')
log += 'grid search: %s\n%r\n' % (estimator.__class__.__name__, params)
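grid_search_auto is essentially a coordinate-descent tuner: it perturbs one hyper-parameter at a time around its current value, keeps stepping in whichever direction improves the CV score, and stops once a full pass over all parameters changes nothing. The toy sketch below isolates that walking logic with a made-up score function (no model training involved):
def tune_one_param(score, value, step, lo, hi):
    """Walk a single parameter while the score keeps improving."""
    best = score(value)
    while True:
        moved = False
        for candidate in (value - step, value + step):
            if lo <= candidate <= hi and score(candidate) > best:
                value, best, moved = candidate, score(candidate), True
        if not moved:
            return value, best

# toy objective peaking at 7: the walk converges to 7 regardless of the starting point
print(tune_one_param(lambda v: -(v - 7) ** 2, value=3, step=1, lo=1, hi=20))  # (7, 0)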
def grid_search_gbdt(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1900,
'max_depth': 9,
'min_samples_split': 200,
'min_samples_leaf': 50,
'subsample': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 10, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 10, 'min': 1, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, GradientBoostingClassifier())
def grid_search_xgb(get_param=False):
params = {
# parameters based on the 8th-place solution
'booster': 'gbtree',
'objective': 'rank:pairwise',
'min_child_weight': 1.1,
'colsample_bylevel': .7,
'reg_lambda': 1,
'learning_rate': 1e-2,
'n_estimators': 3500,
'max_depth': 5,
'gamma': .1,
'subsample': .7,
'colsample_bytree': .7,
'scale_pos_weight': 1,
'reg_alpha': 0,
'nthread': 12,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_child_weight': {'step': 1, 'min': 1, 'max': 'inf'},
'gamma': {'step': .1, 'min': 0, 'max': 1},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
'scale_pos_weight': {'step': 1, 'min': 1, 'max': 10},
'reg_alpha': {'step': .1, 'min': 0, 'max': 1},
}
grid_search_auto(steps, params, XGBClassifier())
def grid_search_lgb(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 1200,
'num_leaves': 51,
'min_split_gain': 0,
'min_child_weight': 1e-3,
'min_child_samples': 22,
'subsample': .8,
'colsample_bytree': .8,
}
if get_param:
return params
steps = {
'n_estimators': {'step': 100, 'min': 1, 'max': 'inf'},
'num_leaves': {'step': 1, 'min': 1, 'max': 'inf'},
'min_split_gain': {'step': .1, 'min': 0, 'max': 1},
'min_child_weight': {'step': 1e-3, 'min': 1e-3, 'max': 'inf'},
'min_child_samples': {'step': 1, 'min': 1, 'max': 'inf'},
# 'subsample': {'step': .1, 'min': .1, 'max': 1},
'colsample_bytree': {'step': .1, 'min': .1, 'max': 1},
}
grid_search_auto(steps, params, LGBMClassifier())
def grid_search_cat(get_param=False):
params = {
# 10
'learning_rate': 1e-2,
'n_estimators': 3600,
'max_depth': 8,
'max_bin': 127,
'reg_lambda': 2,
'subsample': .7,
'one_hot_max_size': 2,
'bootstrap_type': 'Bernoulli',
'leaf_estimation_method': 'Newton',
'verbose': False,
'eval_metric': 'AUC',
'thread_count': cpu_jobs
}
if get_param:
return params
steps = {
'n_estimators': {'step': 150, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'max_bin': {'step': 1, 'min': 1, 'max': 255},
'reg_lambda': {'step': 1, 'min': 0, 'max': 'inf'},
'subsample': {'step': .1, 'min': .1, 'max': 1},
'one_hot_max_size': {'step': 1, 'min': 0, 'max': 255},
}
grid_search_auto(steps, params, CatBoostClassifier())
def grid_search_rf(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3090,
'max_depth': 15,
'min_samples_split': 2,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3110,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, RandomForestClassifier())
def grid_search_et(criterion='gini', get_param=False):
if criterion == 'gini':
params = {
# 10
'n_estimators': 3060,
'max_depth': 22,
'min_samples_split': 12,
'min_samples_leaf': 1,
'criterion': 'gini',
}
else:
params = {
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
}
if get_param:
return params
steps = {
'n_estimators': {'step': 10, 'min': 1, 'max': 'inf'},
'max_depth': {'step': 1, 'min': 1, 'max': 'inf'},
'min_samples_split': {'step': 2, 'min': 2, 'max': 'inf'},
'min_samples_leaf': {'step': 2, 'min': 1, 'max': 'inf'},
}
grid_search_auto(steps, params, ExtraTreesClassifier())
def train_gbdt(model=False):
global log
params = grid_search_gbdt(True)
clf = GradientBoostingClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'gbdt'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', subsample: %.1f' % params['subsample']
log += '\n\n'
return train(clf)
def train_xgb(model=False):
global log
params = grid_search_xgb(True)
clf = XGBClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'xgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_child_weight: %d' % params['min_child_weight']
log += ', gamma: %.1f' % params['gamma']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_lgb(model=False):
global log
params = grid_search_lgb(True)
clf = LGBMClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'lgb'
log += ', learning_rate: %.3f' % params['learning_rate']
log += ', n_estimators: %d' % params['n_estimators']
log += ', num_leaves: %d' % params['num_leaves']
log += ', min_split_gain: %.1f' % params['min_split_gain']
log += ', min_child_weight: %.4f' % params['min_child_weight']
log += ', min_child_samples: %d' % params['min_child_samples']
log += ', subsample: %.1f' % params['subsample']
log += ', colsample_bytree: %.1f' % params['colsample_bytree']
log += '\n\n'
return train(clf)
def train_cat(model=False):
global log
params = grid_search_cat(True)
clf = CatBoostClassifier().set_params(**params)
if model:
return clf
params = clf.get_params()
log += 'cat'
log += ', learning_rate: %.3f' % params['learning_rate']
# get_params() returns the alias names that were actually set above, so log those keys
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', reg_lambda: %d' % params['reg_lambda']
log += ', max_bin: %d' % params['max_bin']
log += ', subsample: %.1f' % params['subsample']
log += ', one_hot_max_size: %d' % params['one_hot_max_size']
log += '\n\n'
return train(clf)
def train_rf(clf):
global log
params = clf.get_params()
log += 'rf'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_rf_gini(model=False):
clf = RandomForestClassifier().set_params(**grid_search_rf('gini', True))
if model:
return clf
return train_rf(clf)
def train_rf_entropy():
clf = RandomForestClassifier().set_params(**grid_search_rf('entropy', True))
return train_rf(clf)
def train_et(clf):
global log
params = clf.get_params()
log += 'et'
log += ', n_estimators: %d' % params['n_estimators']
log += ', max_depth: %d' % params['max_depth']
log += ', min_samples_split: %d' % params['min_samples_split']
log += ', min_samples_leaf: %d' % params['min_samples_leaf']
log += ', criterion: %s' % params['criterion']
log += '\n\n'
return train(clf)
def train_et_gini(model=False):
clf = ExtraTreesClassifier().set_params(**grid_search_et('gini', True))
if model:
return clf
return train_et(clf)
def train_et_entropy():
clf = ExtraTreesClassifier().set_params(**{
'n_estimators': 3100,
'max_depth': 13,
'min_samples_split': 70,
'min_samples_leaf': 10,
'criterion': 'entropy',
'random_state': 0
})
return train_et(clf)
def train(clf):
global log
data = get_train_data()
train_data, test_data = train_test_split(data,
train_size=100000,
random_state=0
)
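# re-split so evaluation happens on the same default 25% hold-out that blending() uses below (same random_state)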
_, test_data = train_test_split(data, random_state=0)
X_train = train_data.copy().drop(columns='Coupon_id')
y_train = X_train.pop('label')
clf = fit_eval_metric(clf, X_train, y_train)
X_test = test_data.copy().drop(columns='Coupon_id')
y_test = X_test.pop('label')
y_true, y_pred = y_test, clf.predict(X_test)
# log += '%s\n' % classification_report(y_test, y_pred)
log += ' accuracy: %f\n' % accuracy_score(y_true, y_pred)
y_score = clf.predict_proba(X_test)[:, 1]
log += ' auc: %f\n' % roc_auc_score(y_true, y_score)
# coupon average auc
coupons = test_data.groupby('Coupon_id').size().reset_index(name='total')
aucs = []
for _, coupon in coupons.iterrows():
if coupon.total > 1:
X_test = test_data[test_data.Coupon_id == coupon.Coupon_id].copy()
X_test.drop(columns='Coupon_id', inplace=True)
if len(X_test.label.unique()) != 2:
continue
y_true = X_test.pop('label')
y_score = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_true, y_score))
log += 'coupon auc: %f\n\n' % np.mean(aucs)
return clf
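The coupon-level AUC at the end of train mirrors the competition metric: AUC is computed per Coupon_id, coupons whose labels are all 0 or all 1 are skipped, and the results are averaged. A standalone sketch, assuming a frame with Coupon_id, label and a predicted prob column:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def coupon_average_auc(df):
    aucs = []
    for _, group in df.groupby('Coupon_id'):
        if group.label.nunique() == 2:  # AUC is undefined when only one class is present
            aucs.append(roc_auc_score(group.label, group.prob))
    return np.mean(aucs)

toy = pd.DataFrame({'Coupon_id': [1, 1, 1, 2, 2],
                    'label':     [0, 1, 1, 0, 1],
                    'prob':      [.2, .8, .6, .9, .4]})
print(coupon_average_auc(toy))  # (1.0 + 0.0) / 2 = 0.5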
def predict(model):
path = 'cache_%s_predict.csv' % os.path.basename(__file__)
if os.path.exists(path):
X = pd.read_csv(path, parse_dates=['Date_received'])
else:
offline, online = get_preprocess_data()
# 2016-03-16 ~ 2016-06-30
start = '2016-03-16'
offline = offline[(offline.Coupon_id == 0) & (start <= offline.Date) | (start <= offline.Date_received)]
online = online[(online.Coupon_id == 0) & (start <= online.Date) | (start <= online.Date_received)]
X = get_preprocess_data(True)
X = get_offline_features(X, offline)
X = get_online_features(online, X)
X.drop_duplicates(inplace=True)
X.fillna(0, inplace=True)
X.to_csv(path, index=False)
sample_submission = X[['User_id', 'Coupon_id', 'Date_received']].copy()
sample_submission.Date_received = sample_submission.Date_received.dt.strftime('%Y%m%d')
drop_columns(X, True)
if model == 'blending':
predict = blending(X)
else:
clf = eval('train_%s' % model)()
predict = clf.predict_proba(X)[:, 1]
sample_submission['Probability'] = predict
sample_submission.to_csv('submission_%s.csv' % model,
# float_format='%.5f',
index=False, header=False)
def blending(predict_X=None):
global log
log += '\n'
X = get_train_data().drop(columns='Coupon_id')
y = X.pop('label')
X = np.asarray(X)
y = np.asarray(y)
_, X_submission, _, y_test_blend = train_test_split(X, y,
random_state=0
)
if predict_X is not None:
X_submission = np.asarray(predict_X)
X, _, y, _ = train_test_split(X, y,
train_size=100000,
random_state=0
)
# np.random.seed(0)
# idx = np.random.permutation(y.size)
# X = X[idx]
# y = y[idx]
skf = StratifiedKFold()
clfs = ['gbdt', 'xgb',
'rf_gini', 'et_gini', 'lgb', 'cat'
]
blend_X_train = np.zeros((X.shape[0], len(clfs)))
blend_X_test = np.zeros((X_submission.shape[0], len(clfs)))
for j, v in enumerate(clfs):
clf = eval('train_%s' % v)(True)
aucs = []
dataset_blend_test_j = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf = fit_eval_metric(clf, X_train, y_train)
y_submission = clf.predict_proba(X_test)[:, 1]
aucs.append(roc_auc_score(y_test, y_submission))
blend_X_train[test_index, j] = y_submission
dataset_blend_test_j.append(clf.predict_proba(X_submission)[:, 1])
log += '%7s' % v + ' auc: %f\n' % np.mean(aucs)
blend_X_test[:, j] = np.asarray(dataset_blend_test_j).T.mean(1)
print('blending')
clf = LogisticRegression()
# clf = GradientBoostingClassifier()
clf.fit(blend_X_train, y)
y_submission = clf.predict_proba(blend_X_test)[:, 1]
# Linear stretch of predictions to [0,1]
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
if predict_X is not None:
return y_submission
log += '\n blend auc: %f\n\n' % roc_auc_score(y_test_blend, y_submission)
print(log)
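blending follows the standard stacked-generalization recipe: each base model's out-of-fold predictions fill one column of blend_X_train, its fold-averaged predictions on the hold-out/prediction set fill the matching column of blend_X_test, and a logistic regression is fitted on top. The same idea can be compressed with scikit-learn's cross_val_predict; a sketch on synthetic data with two base models:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=2000, random_state=0)
bases = [RandomForestClassifier(n_estimators=50, random_state=0),
         GradientBoostingClassifier(random_state=0)]

# out-of-fold probabilities become the meta-features for the second level
meta = np.column_stack([cross_val_predict(clf, X, y, cv=5, method='predict_proba')[:, 1]
                        for clf in bases])
stacker = LogisticRegression().fit(meta, y)
print(stacker.coef_)  # how much weight each base model gets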
if __name__ == '__main__':
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
log = '%s\n' % start.strftime('%Y-%m-%d %H:%M:%S')
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
blending()
predict('blending')
log += 'time: %s\n' % str((datetime.datetime.now() - start)).split('.')[0]
log += '----------------------------------------------------\n'
open('%s.log' % os.path.basename(__file__), 'a').write(log)
print(log)
Model 3: score 0.5
import os
import pickle
from datetime import date
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from Motor_fault.model_utils import build_model_etr, build_model_rf
os.chdir(r'E:\專案檔案\o2o優惠券使用預測')
dfoff = pd.read_csv('ccf_offline_stage1_train.csv')
dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('ccf_online_stage1_train.csv')
# 1. Convert full-reduction coupons of the form `xx:yy` into a discount rate `1 - yy/xx`, and build the coupon features `discount_rate, discount_man, discount_jian, discount_type`
# 2. Convert `Distance` from `str` to `int`
# convert Discount_rate and Distance
def getDiscountType(row):
if pd.isnull(row):
return np.nan
elif ':' in row:
return 1
else:
return 0
def convertRate(row):
"""Convert discount to rate"""
if pd.isnull(row):
return 1.0
elif ':' in str(row):
rows = row.split(':')
return 1.0 - float(rows[1]) / float(rows[0])
else:
return float(row)
def getDiscountMan(row):
if ':' in str(row):
rows = row.split(':')
return int(rows[0])
else:
return 0
def getDiscountJian(row):
if ':' in str(row):
rows = row.split(':')
return int(rows[1])
else:
return 0
print("tool is ok.")
def processData(df):
# convert discount_rate
df['discount_rate'] = df['Discount_rate'].apply(convertRate)
df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
print(df['discount_rate'].unique())
# convert distance
df['distance'] = df['Distance'].fillna(-1).astype(int)
return df
dfoff = processData(dfoff)
dftest = processData(dftest)
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[pd.notnull(date_buy)])
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'],
as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(
['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']
print("end")
def getWeekday(row):
if row == 'nan':
return np.nan
else:
return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type: 1 for Saturday and Sunday, 0 otherwise
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
# change weekday to one-hot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
def label(row):
if pd.isnull(row['Date_received']):
return -1
if pd.notnull(row['Date']):
# the Date columns are read as floats (e.g. 20160516.0), so cast to int and str before parsing
td = pd.to_datetime(str(int(row['Date'])), format='%Y%m%d') - pd.to_datetime(str(int(row['Date_received'])), format='%Y%m%d')
if td <= pd.Timedelta(15, 'D'):
return 1
return 0
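The label rule in one sentence: a record is positive only when a purchase exists and happens within 15 days of receiving the coupon. A few hypothetical rows to make it concrete:
print(label({'Date_received': 20160601.0, 'Date': 20160610.0}))  # 1: consumed 9 days after receipt
print(label({'Date_received': 20160601.0, 'Date': 20160630.0}))  # 0: consumed 29 days after receipt
print(label({'Date_received': np.nan, 'Date': 20160610.0}))      # -1: no coupon was received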
dfoff['label'] = dfoff.apply(label, axis=1)
print("end")
# data split
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")
# feature
original_feature = ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday',
'weekday_type'] + weekdaycols
print("----train-----")
x_train, y_train = train[original_feature], train['label']
x_val = dftest[original_feature]
model_rf = build_model_rf(x_train, y_train)
model_etr = build_model_etr(x_train, y_train)
train_etr_pred = model_etr.predict(x_train)
train_rf_pred = model_rf.predict(x_train)
Strak_X_train = pd.DataFrame()
Strak_X_train['Method_1'] = train_rf_pred
Strak_X_train['Method_2'] = train_etr_pred
# second layer: train a meta-model on the first-layer predictions
model = build_model_etr(Strak_X_train, y_train)
val_rf = model_rf.predict(x_val)
val_etr = model_etr.predict(x_val)
Strak_X_val = pd.DataFrame()
# the meta-feature column names must match those used to train the second layer
Strak_X_val['Method_1'] = val_rf
Strak_X_val['Method_2'] = val_etr
# test prediction for submission
y_test_pred = model.predict_proba(Strak_X_val)
dftest1 = dftest[['User_id', 'Coupon_id', 'Date_received']].copy()
dftest1['label'] = y_test_pred[:, 1]
dftest1.to_csv('submit.csv', index=False, header=False)
dftest1.head()
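Note that the first-layer features above are hard 0/1 outputs from predict; for an AUC-style ranking metric, stacking usually works better on predict_proba outputs. A sketch of that variant (not applied above, same models and frames):
Strak_X_train['Method_1'] = model_rf.predict_proba(x_train)[:, 1]
Strak_X_train['Method_2'] = model_etr.predict_proba(x_train)[:, 1]
Strak_X_val['Method_1'] = model_rf.predict_proba(x_val)[:, 1]
Strak_X_val['Method_2'] = model_etr.predict_proba(x_val)[:, 1]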
import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, \
AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, precision_recall_curve, auc, roc_curve, \
f1_score, recall_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from utils.read_write import writeOneCsv
src = r'E:\專案檔案\電機故障診斷\data\\'
def build_model_dt(x_train, y_train):
estimator = DecisionTreeClassifier(random_state=7)
param_grid = {
'max_depth': range(10, 25, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('dt')
print(model.best_params_)
writeParams('dt', model.best_params_)
return model
def build_model_rf(x_train, y_train):
estimator = RandomForestClassifier()
param_grid = {
'max_depth': range(42, 43, 1),
'n_estimators': range(79, 80, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('rf')
print(model.best_params_)
writeParams('rf', model.best_params_)
return model
def build_model_etr(x_train, y_train):
# extra-trees classifier; n_estimators is the maximum number of trees in the ensemble
estimator = ExtraTreesClassifier()
param_grid = {
'max_depth': range(33, 34, 1),
'n_estimators': range(108, 109, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('etr')
print(model.best_params_)
writeParams('etr', model.best_params_)
return model
def build_model_xgb(x_train, y_train):
estimator = XGBClassifier(gamma=0, colsample_bytree=0.9, subsample=0.91)
param_grid = {
'learning_rate': [ 0.27],
'max_depth': range(12, 13, 1),
'n_estimators': range(34, 35, 3),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('xgb')
print(model.best_params_)
writeParams('xgb', model.best_params_)
return model
def build_model_lgb(x_train, y_train):
estimator = LGBMClassifier()
param_grid = {
'learning_rate': [0.18],
'n_estimators': range(100, 101, 1),
'num_leaves': range(75, 80, 5)
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(x_train, y_train.ravel())
print('lgb')
print(gbm.best_params_)
writeParams('lgb', gbm.best_params_)
return gbm
def build_model_mlpr(x_train, y_train):
from sklearn.neural_network import MLPClassifier
'''relu activation with the lbfgs solver gave the best results here'''
mlp = MLPClassifier(activation='relu', solver='lbfgs')
param_grid = {
'alpha': [0.002, 0.001],
'hidden_layer_sizes': [(38, 19)],
'max_iter': range(75, 85, 1),
}
model = GridSearchCV(mlp, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('mlpr')
print(model.best_params_)
writeParams('mlpr', model.best_params_)
return model
def build_model_ada(x_train, y_train):
estimator = AdaBoostClassifier()
param_grid = {
'learning_rate': [0.23],
'n_estimators': range(13, 14, 1),
}
model = GridSearchCV(estimator, param_grid, cv=3)
model.fit(x_train, y_train)
print('ada')
print(model.best_params_)
writeParams('ada', model.best_params_)
return model
def build_model_gbdt(x_train, y_train):
estimator = GradientBoostingClassifier(min_samples_leaf=0.1, min_samples_split=10, subsample=0.998)
param_grid = {
'learning_rate': [0.75],
'max_depth': range(25, 30, 1),
'n_estimators': range(80, 85, 1)
}
gbdt = GridSearchCV(estimator, param_grid, cv=3)
gbdt.fit(x_train, y_train.ravel())
print('gbdt')
print(gbdt.best_params_)
writeParams('gbdt', gbdt.best_params_)
return gbdt
def build_model_liner_svc(x_train, y_train):
svm_reg = LinearSVC(max_iter=10000)  # LinearSVC needs a positive iteration cap; max_iter=-1 is only valid for SVC
param_grid = {
'C': range(1, 2, 1),
}
model = GridSearchCV(svm_reg, param_grid, cv=3)
model.fit(x_train, y_train)
print('LinearSVC')
print(model.best_params_)
return model
def train_logistic_classifier(x_train, y_train):
model = LogisticRegression()
param_grid = {
'C': range(2, 3, 1),
'penalty': ['l2'],
}
model = GridSearchCV(model, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('LR')
print(model.best_params_)
return model
def build_model_svc(x_train, y_train):
model = SVC(max_iter=-1)
param_grid = {
'C': range(1, 2, 2),
'kernel': ['poly', 'rbf'],  # 'precomputed' would require a precomputed Gram matrix rather than raw features
'cache_size': range(200, 210, 20),
}
model = GridSearchCV(model, param_grid, cv=3)
model.fit(x_train, y_train.ravel())
print('SVC')
print(model.best_params_)
return model
def score_model(test, predict, model, data_type):
accuracy = round(accuracy_score(test, predict), 6)
print(data_type + ',accuracy,', accuracy)
writeOneCsv(['stacking', data_type, 'accuracy', accuracy], src + '調參記錄.csv')
pre_score = precision_score(test, predict, average="macro")
print(data_type + ",precision,", round(pre_score, 6))
writeOneCsv(['stacking', data_type, 'precision', round(pre_score, 6)], src + '調參記錄.csv')
roc_auc = round(roc_auc_score(test, predict), 6)
print(data_type + ",roc_auc,", roc_auc)
writeOneCsv(['stacking', data_type, 'roc_auc', roc_auc], src + '調參記錄.csv')
# sklearn metrics expect (y_true, y_pred) in that order
f1 = f1_score(test, predict)
print(data_type + ",f1,", round(f1, 6))
writeOneCsv(['stacking', data_type, 'f1', round(f1, 6)], src + '調參記錄.csv')
recall = recall_score(test, predict)
print(data_type + ",recall,", round(recall, 6))
writeOneCsv(['stacking', data_type, 'recall', round(recall, 6)], src + '調參記錄.csv')
cohen_kappa = cohen_kappa_score(test, predict)
print(data_type + ",cohen_kappa,", round(cohen_kappa, 6))
writeOneCsv(['stacking', data_type, 'cohen_kappa', round(cohen_kappa, 6)], src + '調參記錄.csv')
def save_load(model, save_or_load):
path = src + 'etr.pkl'
# save model
if save_or_load == 'save':
joblib.dump(model, path)
else:
# load model
model_etr = joblib.load(path)
return model_etr
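Typical round trip for save_load, shown with a throwaway model (the target path src + 'etr.pkl' is the one hard-coded above):
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

toy_model = ExtraTreesClassifier(n_estimators=10).fit(np.random.rand(20, 3), np.random.randint(0, 2, 20))
save_load(toy_model, 'save')         # joblib.dump to <src>etr.pkl
model_etr = save_load(None, 'load')  # joblib.load it back for reuse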
def fit_size(x, y):
from sklearn import preprocessing
x_min = preprocessing.MinMaxScaler()
y_min = preprocessing.MinMaxScaler()
y = np.array(y).reshape(len(y), 1)
x = x_min.fit_transform(x)
y = y_min.fit_transform(y)
return x, y
def scatter_line(y_val, y_pre):
xx = range(0, len(y_val))
plt.scatter(xx, y_val, color="red", label="actual", linewidth=3)
plt.plot(xx, y_pre, color="orange", label="predicted", linewidth=2)
plt.legend()
plt.show()
def draw_ROC_curve(y_test, y_predict):
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_predict)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.legend()
plt.show()
plt.close(0)
def pr(y_val, predict_proba):
precision, recall, thresholds = precision_recall_curve(y_val, predict_proba)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve')
plt.legend()
plt.show()
def writeParams(model, best):
if model in ['gbdt', 'xgb']:
writeOneCsv([model, best['max_depth'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'mlpr':
writeOneCsv([model, best['hidden_layer_sizes'], best['max_iter'], best['alpha']], src + '調參記錄.csv')
elif model == 'ada':
writeOneCsv([model, 0, best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'lgb':
writeOneCsv([model, best['num_leaves'], best['n_estimators'], best['learning_rate']], src + '調參記錄.csv')
elif model == 'dt':
writeOneCsv([model, best['max_depth'], 0, 0], src + '調參記錄.csv')
else:
writeOneCsv([model, best['max_depth'], best['n_estimators'], 0], src + '調參記錄.csv')
def write_mae(model, data_type, mae):
writeOneCsv([model, data_type, 'mae', mae], src + '調參記錄.csv')
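For completeness, a minimal sketch of how these helpers chain together on synthetic data (build the first-layer models, score them, plot the ROC); it assumes the project's writeOneCsv and src path are available, as in the module above:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=0)

model_rf = build_model_rf(x_train, y_train)     # grid-searched random forest
model_etr = build_model_etr(x_train, y_train)   # grid-searched extra trees
for name, model in [('rf', model_rf), ('etr', model_etr)]:
    score_model(y_val, model.predict(x_val), model, name)
    draw_ROC_curve(y_val, model.predict_proba(x_val)[:, 1])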
If this helped, please like, bookmark, and share.