天池大賽o2o優惠券第一名程式碼解讀(4)

楊Laughing發表於2017-07-26

飲水思源,感謝

#使用者的相關資訊
"""
3.user related: 
      count_merchant. 
      user_min_distance, user_max_distance, user_mean_distance, user_median_distance. 
      buy_use_coupon. buy_total. coupon_received.
      buy_use_coupon/coupon_received. 
      buy_use_coupon/buy_total
      user_date_datereceived_gap (avg, min, max)


"""
def get_user_date_datereceived_gap(s):
    """Return the number of days between two dates packed into one string.

    ``s`` has the form ``'YYYYMMDD:YYYYMMDD'``. The result is the first
    date minus the second date, in whole days (negative when the first
    date is earlier than the second).
    """
    def _to_date(stamp):
        # Slice the fixed-width YYYYMMDD layout into year / month / day.
        return date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))

    parts = s.split(':')
    first = _to_date(parts[0])
    second = _to_date(parts[1])
    return (first - second).days

# Dataset 3: keep only the columns the user-level features are derived from.
user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]

# One row per distinct user_id; this is the base table that every feature
# below is left-merged onto. drop_duplicates() is used in its non-in-place
# form so a frame derived by indexing is never mutated in place.
t = user3[['user_id']].drop_duplicates()

# count_merchant: number of distinct merchants per user among rows whose
# `date` is not the 'null' placeholder. De-duplicate (user_id, merchant_id)
# pairs first, then count the remaining rows per user.
t1 = user3.loc[user3.date != 'null', ['user_id', 'merchant_id']].drop_duplicates()
t1 = t1.groupby('user_id').size().reset_index(name='count_merchant')


# user_id and distance for rows where both `date` and `coupon_id` are set
# (neither is the 'null' placeholder). An explicit copy is taken so the
# clean-up below works on an independent frame instead of a slice of user3.
t2 = user3.loc[(user3.date != 'null') & (user3.coupon_id != 'null'), ['user_id', 'distance']].copy()
# Map the 'null' placeholder to the sentinel -1 so the column can be cast to
# int, then turn the sentinel into NaN so the aggregations below skip it.
t2 = t2.replace('null', -1)
t2['distance'] = t2['distance'].astype('int')
t2 = t2.replace(-1, np.nan)

# Per-user minimum distance
t3 = t2.groupby('user_id').agg('min').reset_index()
t3 = t3.rename(columns={'distance': 'user_min_distance'})

# Per-user maximum distance
t4 = t2.groupby('user_id').agg('max').reset_index()
t4 = t4.rename(columns={'distance': 'user_max_distance'})

# Per-user mean distance
t5 = t2.groupby('user_id').agg('mean').reset_index()
t5 = t5.rename(columns={'distance': 'user_mean_distance'})

# Per-user median distance
t6 = t2.groupby('user_id').agg('median').reset_index()
t6 = t6.rename(columns={'distance': 'user_median_distance'})

# buy_use_coupon: per-user count of rows where both `date` and `coupon_id`
# are set. groupby(...).size() counts rows directly, replacing the
# "add a column of ones to a filtered slice, then sum" pattern.
t7 = user3.loc[(user3.date != 'null') & (user3.coupon_id != 'null'), ['user_id']]
t7 = t7.groupby('user_id').size().reset_index(name='buy_use_coupon')

# buy_total: per-user count of rows where `date` is set.
t8 = user3.loc[user3.date != 'null', ['user_id']]
t8 = t8.groupby('user_id').size().reset_index(name='buy_total')

# coupon_received: per-user count of rows where `coupon_id` is set.
t9 = user3.loc[user3.coupon_id != 'null', ['user_id']]
t9 = t9.groupby('user_id').size().reset_index(name='coupon_received')

# Gap in days between `date` and `date_received`, for rows where both are
# set. The two columns are joined as 'date:date_received' and handed to
# get_user_date_datereceived_gap, which returns date minus date_received.
# An explicit copy is taken so adding the new column does not write into a
# slice of user3.
t10 = user3.loc[(user3.date_received != 'null') & (user3.date != 'null'), ['user_id', 'date_received', 'date']].copy()
t10['user_date_datereceived_gap'] = (t10.date + ':' + t10.date_received).apply(get_user_date_datereceived_gap)
t10 = t10[['user_id', 'user_date_datereceived_gap']]

# Per-user mean gap
t11 = t10.groupby('user_id').agg('mean').reset_index()
t11 = t11.rename(columns={'user_date_datereceived_gap': 'avg_user_date_datereceived_gap'})

# Per-user minimum gap
t12 = t10.groupby('user_id').agg('min').reset_index()
t12 = t12.rename(columns={'user_date_datereceived_gap': 'min_user_date_datereceived_gap'})

# Per-user maximum gap.
# Fix: the rename key used to be misspelled 'user_date_datareceived_gap', so
# the rename silently matched nothing and the column kept its generic name.
t13 = t10.groupby('user_id').agg('max').reset_index()
t13 = t13.rename(columns={'user_date_datereceived_gap': 'max_user_date_datereceived_gap'})

# Left-merge every per-user feature table onto the distinct-user table `t`,
# so each user keeps one row and features with no data for a user are NaN.
user3_feature = (
    t.merge(t1, on='user_id', how='left')
     .merge(t3, on='user_id', how='left')
     .merge(t4, on='user_id', how='left')
     .merge(t5, on='user_id', how='left')
     .merge(t6, on='user_id', how='left')
     .merge(t7, on='user_id', how='left')
     .merge(t8, on='user_id', how='left')
     .merge(t9, on='user_id', how='left')
     .merge(t11, on='user_id', how='left')
     .merge(t12, on='user_id', how='left')
     .merge(t13, on='user_id', how='left')
)

# Users absent from a count table have a count of zero, not a missing value.
user3_feature['count_merchant'] = user3_feature['count_merchant'].fillna(0)
# Fix: this used to assign to the misspelled attribute `buy_user_coupon`,
# which set a stray attribute on the DataFrame and left the NaNs in the
# real `buy_use_coupon` column untouched.
user3_feature['buy_use_coupon'] = user3_feature['buy_use_coupon'].fillna(0)

# The ratios are computed BEFORE the denominators are zero-filled, so a user
# with no buy_total / coupon_received gets NaN (0 / NaN) rather than a
# division by zero.
# Share of the user's purchases that used a coupon.
user3_feature['buy_use_coupon_rate'] = user3_feature['buy_use_coupon'].astype('float') / user3_feature['buy_total'].astype('float')
# Share of the user's received coupons that were used in a purchase.
user3_feature['user_coupon_transfer_rate'] = user3_feature['buy_use_coupon'].astype('float') / user3_feature['coupon_received'].astype('float')

# Now that the ratios exist, replace the missing denominators with 0.
user3_feature['buy_total'] = user3_feature['buy_total'].fillna(0)
user3_feature['coupon_received'] = user3_feature['coupon_received'].fillna(0)
print(user3_feature)

相關文章