最近在為機器學習結合推薦演算法的優化方法和資料來源想辦法。抱著學習的態度繼續解讀19-AnalytiCup的冠軍原始碼。
第一部分itemcf解讀的連結:https://www.cnblogs.com/missouter/p/12701875.html
第二、三部分主要是特徵提取和排序。在這篇部落格中將作展開。
1、generate_static_features.ipynb 標題簡潔明瞭 提取靜態特徵
import pandas as pd
import numpy as np

_BYTES_PER_MB = 1024 ** 2
# Candidate target dtypes, narrowest first.
_INT_TYPES = (np.int8, np.int16, np.int32, np.int64)
_FLOAT_TYPES = (np.float16, np.float32)
# Raw weight of each behavior type; 'buy' shares the weight of 'pv' (kept
# from the original code).
_BEHAVIOR_TYPES = ('pv', 'fav', 'cart', 'buy')
_BEHAVIOR_WEIGHTS = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 1}


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Integer columns are cast to the narrowest signed integer type whose
    range strictly contains the column's min and max; float columns are cast
    to float16/float32 when the values fit, float64 otherwise; object and
    string columns become ``category``.  Columns of any other dtype (bool,
    datetime, category, ...) are left untouched, so calling the function
    twice on the same frame is safe.

    NOTE: float16 keeps only about three significant decimal digits, so
    float columns may lose precision.

    Args:
        df: DataFrame to shrink.  It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # memory_usage() reports bytes; convert so the "MB" label is accurate.
    start_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy integer/float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            for candidate in _INT_TYPES:
                info = np.iinfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in _FLOAT_TYPES:
                info = np.finfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for NaN/inf extremes, where comparisons fail.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved))
    return df


def load_data(path):
    """Load the user, item and behavior tables and build the base frame.

    Reads the header-less files ``user.csv``, ``item.csv`` and
    ``user_behavior.csv`` from ``path``.  The behavior log gains ``day``,
    ``hour`` and ``day_hour`` columns derived from ``timestamp``, one 0/1
    indicator column per behavior type, and its ``behavior`` column is
    replaced by a numeric weight: the raw weight of the behavior type
    multiplied by a factor that grows linearly with ``day_hour``.

    Args:
        path: Directory prefix that is concatenated directly with the file
            names, so it must end with a path separator.

    Returns:
        A ``(user, item, data)`` tuple where ``data`` is the behavior log
        left-joined with the item and user attributes.
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv', header=None))

    data = pd.read_csv(path + 'user_behavior.csv', header=None)
    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    # One-hot indicator column per behavior type.
    for name in _BEHAVIOR_TYPES:
        data[name] = (data['behavior'] == name).astype(int)

    # Time-weighted behavior score.  Mapping through a dict yields a numeric
    # column directly; unknown behavior labels become NaN.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    raw_weight = data['behavior'].map(_BEHAVIOR_WEIGHTS).astype(float)
    max_day = data['day'].max()
    min_day = data['day'].min()
    decay = 1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)
    data['behavior'] = decay * raw_weight

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)
    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')
    return user, item, data
讀取資料與記憶體優化這塊已經是老生常談。load_data()函式順便完成了對各類行為權重的轉換,值得一提的是購買(buy)權重被分配為1,而瀏覽(pv)、收藏(fav)、加入購物車(cart)則分別被分配為1、2、3;目的是為了不向顧客推薦已購買過的商品。之後權重還會乘上一個隨時間線性變化的係數,越接近最後一天的行為權重越高。
主函式部分:
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

# For every entity column, write one CSV with the number of behavior rows
# ('count') and one with the summed behavior weight ('sum').
entity_columns = ['itemID', 'shop', 'category', 'brand']
for stat in ('count', 'sum'):
    for count_feature in entity_columns:
        grouped = data[['behavior', count_feature]].groupby(count_feature, as_index=False)
        summary = grouped.agg({'behavior': stat})
        summary = summary.rename(columns={'behavior': count_feature + '_' + stat})
        summary.to_csv(str(count_feature) + '_' + stat + '.csv', index=False)
確定路徑後,對item、shop、category與brand的特徵進行提取。使用groupby().agg()分別提取使用者行為權重的次數與累加和(agg引數'count'與'sum')。生成檔案分別儲存於csv檔案中。
# Median, standard deviation and skewness of the behavior weight, computed
# per category and then per item; each result goes to '<key>_higher.csv'.
for key in ('category', 'itemID'):
    temp = data[['behavior', key]].groupby(key, as_index=False).agg(
        {'behavior': ['median', 'std', 'skew']})
    # Flatten the two-level column index produced by the list of aggregations.
    temp.columns = [key, key + '_median', key + '_std', key + '_skew']
    temp.to_csv(key + '_higher.csv', index=False)
上述程式碼使用groupby().agg()提取每個單獨category、單獨id的行為中值、標準差與偏斜。
# Bucket ages by integer division by 10 before grouping on them.
data['age'] = data['age'] // 10
train = data[data['day'] < 15]

# Count behavior rows per (item, user attribute value) over the full log and
# write one "online" CSV per attribute.
for count_feature in ['sex', 'ability', 'age']:
    pair_groups = data[['behavior', 'itemID', count_feature]].groupby(
        ['itemID', count_feature], as_index=False)
    counts = pair_groups.agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': 'user_to_' + count_feature + '_count'})
    counts.to_csv('item_to_' + str(count_feature) + '_count_online.csv', index=False)
這段以商品與使用者的基本資料(性別、對推薦系統的影響力、年齡,其中年齡先以10為單位分段)的組合為分組依據,統計每個商品在各使用者群體中的行為次數,作為特徵寫入csv檔案。
itemcount = pd.read_csv('itemID_count.csv')
temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')

# Rank items inside each category by behavior count, highest count first.
item_rank = []
for _, members in temp.groupby('category'):
    ordered = members.sort_values('itemID_count', ascending=False).reset_index(drop=True)
    group_size = ordered.shape[0]
    position = ordered.index + 1  # 1-based rank within the category
    ordered['rank'] = position
    ordered['rank_percent'] = position / group_size
    item_rank.append(ordered[['itemID', 'rank', 'rank_percent']])
使用merge對item與item的行為次數進行拼接。使用groupby按照商品類別進行分類。每個類別內商品按照商品的行為次數進行排序,算出商品的類內排名與排名百分比,
# Stack the per-category rank frames into a single table and persist it.
item_rank = pd.concat(objs=item_rank, sort=False)
item_rank.to_csv('item_rank.csv', index=False)
將各類別生成的類內排序結果使用concat()合併成一個DataFrame,寫入檔案。
def unique_count(x):
    """Return the number of distinct values in ``x``."""
    return len(set(x))


# Number of distinct items, brands and shops under each category.
_renames = (
    ('itemID', 'itemnum_undercat'),
    ('brand', 'brandnum_undercat'),
    ('shop', 'shopnum_undercat'),
)
cat1, cat2, cat3 = [
    item.groupby('category', as_index=False)
        .agg({source: unique_count})
        .rename(columns={source: target})
    for source, target in _renames
]

# The three frames come from the same groupby key, so their rows line up and
# can be concatenated side by side.
category_lower = pd.concat(
    [cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1)
category_lower.to_csv('category_lower.csv', index=False)
這裡先定義一個統計集合內元素數量的函式,應用在agg()中作為引數,用groupby以類別進行分類,統計每個類別中商品、品牌與商家的數量,寫入csv檔案。
2、generate_dynamic_feature.ipynb 提取動態特徵
import pandas as pd
import numpy as np

_BYTES_PER_MB = 1024 ** 2
# Candidate target dtypes, narrowest first.
_INT_TYPES = (np.int8, np.int16, np.int32, np.int64)
_FLOAT_TYPES = (np.float16, np.float32)
# Raw weight of each behavior type; 'buy' shares the weight of 'pv' (kept
# from the original code).
_BEHAVIOR_TYPES = ('pv', 'fav', 'cart', 'buy')
_BEHAVIOR_WEIGHTS = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 1}


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Integer columns are cast to the narrowest signed integer type whose
    range strictly contains the column's min and max; float columns are cast
    to float16/float32 when the values fit, float64 otherwise; object and
    string columns become ``category``.  Columns of any other dtype (bool,
    datetime, category, ...) are left untouched, so calling the function
    twice on the same frame is safe.

    NOTE: float16 keeps only about three significant decimal digits, so
    float columns may lose precision.

    Args:
        df: DataFrame to shrink.  It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # memory_usage() reports bytes; convert so the "MB" label is accurate.
    start_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy integer/float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            for candidate in _INT_TYPES:
                info = np.iinfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in _FLOAT_TYPES:
                info = np.finfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for NaN/inf extremes, where comparisons fail.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved))
    return df


def load_data(path):
    """Load the user, item and behavior tables and build the base frame.

    Reads the header-less files ``user.csv``, ``item.csv`` and
    ``user_behavior.csv`` from ``path``.  The behavior log gains ``day``,
    ``hour`` and ``day_hour`` columns derived from ``timestamp``, one 0/1
    indicator column per behavior type, and its ``behavior`` column is
    replaced by a numeric weight: the raw weight of the behavior type
    multiplied by a factor that grows linearly with ``day_hour``.

    Args:
        path: Directory prefix that is concatenated directly with the file
            names, so it must end with a path separator.

    Returns:
        A ``(user, item, data)`` tuple where ``data`` is the behavior log
        left-joined with the item and user attributes.
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv', header=None))

    data = pd.read_csv(path + 'user_behavior.csv', header=None)
    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    # One-hot indicator column per behavior type.
    for name in _BEHAVIOR_TYPES:
        data[name] = (data['behavior'] == name).astype(int)

    # Time-weighted behavior score.  Mapping through a dict yields a numeric
    # column directly; unknown behavior labels become NaN.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    raw_weight = data['behavior'].map(_BEHAVIOR_WEIGHTS).astype(float)
    max_day = data['day'].max()
    min_day = data['day'].min()
    decay = 1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)
    data['behavior'] = decay * raw_weight

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)
    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')
    return user, item, data
與靜態特徵提取一樣。
主函式部分:
# Alternative (Windows-style) data location: '..\\data\\'
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

train = data[data['day'] < 15]
online_features = []

entity_columns = ['category', 'shop', 'brand']

# Per (user, entity): number of behavior rows ('count') and summed behavior
# weight ('sum').  Each result is written to '<feature name>.csv'.
for stat in ('count', 'sum'):
    for count_feature in entity_columns:
        feature_name = 'user_to_' + count_feature + '_' + stat
        pair_groups = train[['behavior', 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False)
        feature = pair_groups.agg({'behavior': stat})
        feature = feature.rename(columns={'behavior': feature_name})
        feature.to_csv('user_to_' + str(count_feature) + '_' + stat + '.csv', index=False)

# Per (user, entity, behavior type): summing the 0/1 indicator column gives
# the number of rows of that behavior type.
for count_feature in entity_columns:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        feature_name = 'user_to_' + count_feature + '_count_' + behavior_type
        pair_groups = train[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False)
        feature = pair_groups.agg({behavior_type: 'sum'})
        feature = feature.rename(columns={behavior_type: feature_name})
        feature.to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '.csv',
            index=False)
將第15天之前(day < 15)的使用者資料進行特徵提取。同第一個檔案一樣的特徵提取方式,只不過第二步提取的主體是使用者。分別對使用者與其產生行為的類別、商家與品牌進行次數、行為加權的特徵提取。再對使用者的四種行為型別與類別、商家與品牌進行累加和提取:由於pv、fav、cart、buy是0/1的one-hot欄位,對其使用sum求和得到的正是該行為的次數。最後寫入csv檔案。
# Same user-to-entity features, restricted to day 14 only.
yestday = data[data['day'] == 14]
entity_columns = ['category', 'shop', 'brand']

# Number of behavior rows per (user, entity).
for count_feature in entity_columns:
    feature_name = 'user_to_' + count_feature + '_count_yestday'
    pair_groups = yestday[['behavior', 'userID', count_feature]].groupby(
        ['userID', count_feature], as_index=False)
    feature = pair_groups.agg({'behavior': 'count'}).rename(
        columns={'behavior': feature_name})
    feature.to_csv('user_to_' + str(count_feature) + '_count_yestday.csv', index=False)

# Number of rows of each behavior type per (user, entity), obtained by
# summing the 0/1 indicator column.
for count_feature in entity_columns:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        feature_name = 'user_to_' + count_feature + '_count_' + behavior_type + '_yestday'
        pair_groups = yestday[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False)
        feature = pair_groups.agg({behavior_type: 'sum'}).rename(
            columns={behavior_type: feature_name})
        feature.to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '_yestday.csv',
            index=False)
單獨對昨天的使用者資料進行提取,針對行為次數與類別寫入csv檔案。
# User-to-entity features over the five days before day 15 (days 10..14).
# The lower bound is inclusive: the previous strict comparison
# `day > 15 - 5` selected only four days (11..14), which did not match the
# '_5days' feature names.
a5days = data[(data['day'] >= 15 - 5) & (data['day'] < 15)]

# Number of behavior rows per (user, entity).
for count_feature in ['category', 'shop', 'brand']:
    a5days[['behavior', 'userID', count_feature]].groupby(
        ['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(
        columns={'behavior': 'user_to_' + count_feature + '_count_5days'}).to_csv(
        'user_to_' + str(count_feature) + '_count_5days.csv', index=False)

# Number of rows of each behavior type per (user, entity), obtained by
# summing the 0/1 indicator column.
for count_feature in ['category', 'shop', 'brand']:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        a5days[[behavior_type, 'userID', count_feature]].groupby(
            ['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(
            columns={behavior_type: 'user_to_' + count_feature + '_count_'
                     + behavior_type + '_5days'}).to_csv(
            'user_to_' + str(count_feature) + '_count_' + behavior_type + '_5days.csv',
            index=False)
針對近五天的使用者資料進行提取,針對行為次數與類別寫入csv檔案。
# Work on an explicit copy so the new column is not written to a slice of
# `data`.
test = data[data['day'] < 15].copy()
start_timestamp = test['timestamp'].max()
# The aggregation below reads 'last_time', which nothing in this snippet
# created; define it as the time elapsed between each row and the latest
# timestamp before day 15.
test['last_time'] = start_timestamp - test['timestamp']

# Per (user, entity): smallest elapsed time (i.e. the most recent
# interaction) and the latest day with an interaction.
time_features = []
for time_feature in ['shop', 'category', 'brand']:
    pair_groups = test[['last_time', 'userID', time_feature, 'day']].groupby(
        ['userID', time_feature], as_index=False)
    feature = pair_groups.agg({'last_time': 'min', 'day': 'max'}).rename(
        columns={'last_time': 'user_to_' + time_feature + '_lasttime',
                 'day': 'user_to_' + time_feature + '_lastday'})
    time_features.append(feature)

# Each file is named after the frame's third column ('user_to_<x>_lasttime').
for f in time_features:
    f.to_csv(str(f.columns[2]) + '.csv', index=False)
for f in time_features:
    print(str(f.columns[2]) + '.csv')
對每個使用者訪問商戶、品牌與類別的最新時間進行提取,寫入csv中。
# Count behavior rows per (item, user attribute value) on the pre-day-15
# training slice and write one CSV per attribute.
# NOTE(review): nothing in this snippet buckets 'age' by 10 the way the
# static-feature snippet does before writing its '_online' files -- confirm
# that the two feature sets are meant to use the same age encoding.
for count_feature in ['sex', 'ability', 'age']:
    pair_groups = train[['behavior', 'itemID', count_feature]].groupby(
        ['itemID', count_feature], as_index=False)
    counts = pair_groups.agg({'behavior': 'count'})
    counts = counts.rename(columns={'behavior': 'user_to_' + count_feature + '_count'})
    counts.to_csv('item_to_' + str(count_feature) + '_count.csv', index=False)
最後以每個使用者的基本資料(性別、對推薦系統的影響力、年齡)為基準,對其對應的行為次數進行特徵提取,生成一個與第一步對應的線下特徵檔案。
3、generate_time_feature.ipynb 提取時間特徵
# This snippet uses `pd` and `np`, so import them explicitly.
import pandas as pd
import numpy as np

_BYTES_PER_MB = 1024 ** 2
# Candidate target dtypes, narrowest first.
_INT_TYPES = (np.int8, np.int16, np.int32, np.int64)
_FLOAT_TYPES = (np.float16, np.float32)
# Raw weight of each behavior type; 'buy' shares the weight of 'pv' (kept
# from the original code).
_BEHAVIOR_TYPES = ('pv', 'fav', 'cart', 'buy')
_BEHAVIOR_WEIGHTS = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 1}


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Integer columns are cast to the narrowest signed integer type whose
    range strictly contains the column's min and max; float columns are cast
    to float16/float32 when the values fit, float64 otherwise; object and
    string columns become ``category``.  Columns of any other dtype (bool,
    datetime, category, ...) are left untouched, so calling the function
    twice on the same frame is safe.

    NOTE: float16 keeps only about three significant decimal digits, so
    float columns may lose precision.

    Args:
        df: DataFrame to shrink.  It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # memory_usage() reports bytes; convert so the "MB" label is accurate.
    start_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy integer/float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            for candidate in _INT_TYPES:
                info = np.iinfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in _FLOAT_TYPES:
                info = np.finfo(candidate)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for NaN/inf extremes, where comparisons fail.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / _BYTES_PER_MB
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved))
    return df


def load_data(path):
    """Load the user, item and behavior tables and build the base frame.

    Reads the header-less files ``user.csv``, ``item.csv`` and
    ``user_behavior.csv`` from ``path``.  The behavior log gains ``day``,
    ``hour`` and ``day_hour`` columns derived from ``timestamp``, one 0/1
    indicator column per behavior type, and its ``behavior`` column is
    replaced by a numeric weight: the raw weight of the behavior type
    multiplied by a factor that grows linearly with ``day_hour``.

    Args:
        path: Directory prefix that is concatenated directly with the file
            names, so it must end with a path separator.

    Returns:
        A ``(user, item, data)`` tuple where ``data`` is the behavior log
        left-joined with the item and user attributes.
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv', header=None))

    data = pd.read_csv(path + 'user_behavior.csv', header=None)
    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    # One-hot indicator column per behavior type.
    for name in _BEHAVIOR_TYPES:
        data[name] = (data['behavior'] == name).astype(int)

    # Time-weighted behavior score.  Mapping through a dict yields a numeric
    # column directly; unknown behavior labels become NaN.
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    raw_weight = data['behavior'].map(_BEHAVIOR_WEIGHTS).astype(float)
    max_day = data['day'].max()
    min_day = data['day'].min()
    decay = 1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)
    data['behavior'] = decay * raw_weight

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)
    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')
    return user, item, data
一樣的讀取步驟。
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

# Take an explicit copy: adding a column to a filtered slice of `data`
# triggers pandas' SettingWithCopyWarning and is not guaranteed to behave
# the same across pandas versions.
train = data[data['day'] < 15].copy()
start_timestamp = train['timestamp'].max()
# Time elapsed between each row and the latest timestamp in the slice.
train['last_time'] = start_timestamp - train['timestamp']

# Per entity: the smallest elapsed time (i.e. its most recent interaction)
# plus that elapsed time converted to whole hours modulo 24.
timefeatures = []
for time_feature in ['itemID', 'shop', 'category', 'brand']:
    name = time_feature + '_last_time_underline.csv'
    tf = train[['last_time', time_feature]].groupby(
        time_feature, as_index=False).agg({'last_time': 'min'}).rename(
        columns={'last_time': time_feature + 'last_time'})
    tf[time_feature + 'last_time_hour_ed'] = tf[time_feature + 'last_time'] // 3600 % 24
    timefeatures.append((name, tf))

for out_name, frame in timefeatures:
    frame.to_csv(out_name, index=False)
這裡作者演示了一種提取某個商品/店鋪/類別/品牌最後一次被點選距離訓練區間結束(第15天之前的最大時間戳)有多久的方法。通過計算最大時間戳減去每個訪問的時間戳得到last_time,通過groupby()分類,agg()提取每組中最小的last_time,即得到該商品/店鋪/類別/品牌最近一次行為距今的時間。
至此,特徵提取的原始碼分析就結束了。這部分的程式碼給我的感覺是groupby().agg()使用的非常熟練老道,特徵工程的構建有很多值得學習的地方。
原始碼直接跑起來會出現一些意想不到的bug,我們非常感謝原作者薛傳雨提供的幫助。