智算之道-肝炎預測

蒟蒻瑟瑟發抖發表於2020-10-19

肝炎預測

  1. 資料處理
    缺失較少的欄位使用眾數或平均數填充;缺失較多的欄位(如 ALF)則使用隨機森林進行預測填補。

  2. 模型訓練
    使用catboost進行預測,程式碼如下:

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
#Ignore RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility

import pandas as pd
import numpy as np 
from pandas import Series,DataFrame

# Load the competition data. Both files live in the same directory, so the
# directory is defined once (before its first use) and shared by both reads.
path = '/home/kesci/data/competition_A/'
data_train = pd.read_csv(path + 'train_set.csv', engine='python', encoding='UTF-8')
data_test = pd.read_csv(path + 'test_set.csv')
data_train.info()

# Notebook display settings. Fully qualified option names are used because the
# short patterns 'max_columns' / 'max_row' can match more than one registered
# option in recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
data_train.head()
data_train.describe()
# Number of training rows (notebook display only).
data_train.shape[0]

# Mean height per region, computed before any filling so that imputed values
# do not feed back into the means.
data_shengao_west = data_train['身高'][data_train['區域'] == 'west'].mean()
data_shengao_east = data_train['身高'][data_train['區域'] == 'east'].mean()
data_shengao_north = data_train['身高'][data_train['區域'] == 'north'].mean()
data_shengao_south = data_train['身高'][data_train['區域'] == 'south'].mean()

# Fill each missing height with the mean of the row's region. Boolean masks
# replace the former `for i in range(6000)` loop, so the code no longer
# depends on the row count or on the index being 0..N-1.
height_missing = data_train['身高'].isnull()
for region, region_mean in (
    ('west', data_shengao_west),
    ('east', data_shengao_east),
    ('north', data_shengao_north),
    ('south', data_shengao_south),
):
    in_region = data_train['區域'] == region
    data_train.loc[height_missing & in_region, '身高'] = region_mean
# Mean height per sex (notebook display only; the values are not stored).
data_train['身高'][data_train['性別'] == 'F'].mean()
data_train['身高'][data_train['性別'] == 'M'].mean()

# Mean weight per region, computed before any filling.
data_weight_west = data_train['體重'][data_train['區域'] == 'west'].mean()
data_weight_east = data_train['體重'][data_train['區域'] == 'east'].mean()
data_weight_north = data_train['體重'][data_train['區域'] == 'north'].mean()
data_weight_south = data_train['體重'][data_train['區域'] == 'south'].mean()

# Fill each missing weight with the mean of the row's region. Mask-based
# assignment removes the hard-coded row count (6000) and the mix of
# positional (.iloc) and label (.at) indexing used by the former loop.
weight_missing = data_train['體重'].isnull()
for region, region_mean in (
    ('west', data_weight_west),
    ('east', data_weight_east),
    ('north', data_weight_north),
    ('south', data_weight_south),
):
    in_region = data_train['區域'] == region
    data_train.loc[weight_missing & in_region, '體重'] = region_mean

# Fill missing '體重指數' values from height, with a sex-specific formula.
# NOTE(review): both formulas use height only; confirm this is the intended
# estimate for this column.
# Masks replace the former `range(6000)` loops, so the code works for any row
# count and any index.
bmi_missing = data_train['體重指數'].isnull()
fill_female = bmi_missing & (data_train['性別'] == 'F')
fill_male = bmi_missing & (data_train['性別'] == 'M')
data_train.loc[fill_female, '體重指數'] = (data_train.loc[fill_female, '身高'] - 80) * 0.7
data_train.loc[fill_male, '體重指數'] = (data_train.loc[fill_male, '身高'] - 70) * 0.6

# Fill the missing '肥胖腰圍' flag: 1.0 when '體重指數' exceeds 30, else 0.0.
# A still-missing '體重指數' compares False and therefore yields 0.0, exactly
# as in the former row-by-row loop.
waist_missing = data_train['肥胖腰圍'].isnull()
data_train.loc[waist_missing, '肥胖腰圍'] = (
    data_train.loc[waist_missing, '體重指數'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps.
num_columns = ['年齡','體重','身高','體重指數', '腰圍', '最高血壓', '最低血壓',
                '好膽固醇', '壞膽固醇', '總膽固醇','體育活動']
zero_to_one_columns = ['血脂異常','PVD','家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞','糖尿病','教育','未婚','收入','肥胖腰圍','性別','區域','護理來源','視力不佳','飲酒','高血壓','肝炎']

# Capture the west-region rows BEFORE encoding: once '區域' is integer-coded
# a comparison with the string 'west' can never be true.
is_west = data_train['區域'] == 'west'

# Fill missing '高血壓' with its most frequent value BEFORE encoding. The former
# call ran after encoding (when NaN had already become the label 'nan') and
# discarded its result, so it had no effect.
hypertension_mode = data_train['高血壓'].mode()
if not hypertension_mode.empty:
    data_train['高血壓'] = data_train['高血壓'].fillna(hypertension_mode.iloc[0])

# Integer-encode the categorical columns; tqdm shows a progress bar.
for col in tqdm(str_columns):
    data_train[col] = LabelEncoder().fit_transform(data_train[col].astype(str))

# Rule 1: a missing ALF becomes 0.0 when the encoded '高血壓' value is 0.
alf_missing = data_train['ALF'].isnull()
data_train.loc[alf_missing & (data_train['高血壓'] == 0), 'ALF'] = 0.0

# Rule 2: any ALF still missing becomes 0.0 for rows in the west region.
data_train.loc[data_train['ALF'].isnull() & is_west, 'ALF'] = 0.0
data_train.info()

# Notebook display settings for inspecting the test set. Fully qualified option
# names are used because the short patterns 'max_columns' / 'max_row' can match
# several registered options in recent pandas and then raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
data_test.head()
data_test.describe()
# Number of test rows (notebook display only).
data_test.shape[0]
# Mean height per region in the test set, computed before any filling.
data_shengao_west = data_test['身高'][data_test['區域'] == 'west'].mean()
data_shengao_east = data_test['身高'][data_test['區域'] == 'east'].mean()
data_shengao_north = data_test['身高'][data_test['區域'] == 'north'].mean()
data_shengao_south = data_test['身高'][data_test['區域'] == 'south'].mean()

# Fill each missing height with the mean of the row's region. Boolean masks
# replace the former `for i in range(2785)` loop, so the code no longer
# depends on the row count or on the index being 0..N-1.
height_missing = data_test['身高'].isnull()
for region, region_mean in (
    ('west', data_shengao_west),
    ('east', data_shengao_east),
    ('north', data_shengao_north),
    ('south', data_shengao_south),
):
    in_region = data_test['區域'] == region
    data_test.loc[height_missing & in_region, '身高'] = region_mean
# Mean height per sex in the test set (notebook display only; not stored).
data_test['身高'][data_test['性別'] == 'F'].mean()
data_test['身高'][data_test['性別'] == 'M'].mean()

# Mean weight per region in the test set, computed before any filling.
data_weight_west = data_test['體重'][data_test['區域'] == 'west'].mean()
data_weight_east = data_test['體重'][data_test['區域'] == 'east'].mean()
data_weight_north = data_test['體重'][data_test['區域'] == 'north'].mean()
data_weight_south = data_test['體重'][data_test['區域'] == 'south'].mean()

# Fill each missing weight with the mean of the row's region. Mask-based
# assignment removes the hard-coded row count (2785) and the mix of
# positional (.iloc) and label (.at) indexing used by the former loop.
weight_missing = data_test['體重'].isnull()
for region, region_mean in (
    ('west', data_weight_west),
    ('east', data_weight_east),
    ('north', data_weight_north),
    ('south', data_weight_south),
):
    in_region = data_test['區域'] == region
    data_test.loc[weight_missing & in_region, '體重'] = region_mean

# Fill missing '體重指數' values from height, with a sex-specific formula.
# NOTE(review): both formulas use height only; confirm this is the intended
# estimate for this column.
# Masks replace the former `range(2785)` loops, so the code works for any row
# count and any index.
bmi_missing = data_test['體重指數'].isnull()
fill_female = bmi_missing & (data_test['性別'] == 'F')
fill_male = bmi_missing & (data_test['性別'] == 'M')
data_test.loc[fill_female, '體重指數'] = (data_test.loc[fill_female, '身高'] - 80) * 0.7
data_test.loc[fill_male, '體重指數'] = (data_test.loc[fill_male, '身高'] - 70) * 0.6

# Fill the missing '肥胖腰圍' flag: 1.0 when '體重指數' exceeds 30, else 0.0.
# A still-missing '體重指數' compares False and therefore yields 0.0, exactly
# as in the former row-by-row loop.
waist_missing = data_test['肥胖腰圍'].isnull()
data_test.loc[waist_missing, '肥胖腰圍'] = (
    data_test.loc[waist_missing, '體重指數'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps (the test set has no '肝炎'
# target column, so it is absent from str_columns here).
num_columns = ['年齡','體重','身高','體重指數', '腰圍', '最高血壓', '最低血壓',
                '好膽固醇', '壞膽固醇', '總膽固醇','體育活動']
zero_to_one_columns = ['血脂異常','PVD','家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞','糖尿病','教育','未婚','收入','肥胖腰圍','性別','區域','護理來源','視力不佳','飲酒','高血壓']

# Capture the west-region rows BEFORE encoding: once '區域' is integer-coded
# a comparison with the string 'west' can never be true.
is_west = data_test['區域'] == 'west'

# Fill missing '高血壓' with its most frequent value BEFORE encoding. The former
# call ran after encoding (when NaN had already become the label 'nan') and
# discarded its result, so it had no effect.
hypertension_mode = data_test['高血壓'].mode()
if not hypertension_mode.empty:
    data_test['高血壓'] = data_test['高血壓'].fillna(hypertension_mode.iloc[0])

# Integer-encode the categorical columns; tqdm shows a progress bar.
for col in tqdm(str_columns):
    data_test[col] = LabelEncoder().fit_transform(data_test[col].astype(str))

# Rule 1: a missing ALF becomes 0.0 when the encoded '高血壓' value is 0.
alf_missing = data_test['ALF'].isnull()
data_test.loc[alf_missing & (data_test['高血壓'] == 0), 'ALF'] = 0.0

# Rule 2: any ALF still missing becomes 0.0 for rows in the west region.
data_test.loc[data_test['ALF'].isnull() & is_west, 'ALF'] = 0.0
data_test.info()
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps.
num_columns = ['年齡','體重','身高','體重指數', '腰圍', '最高血壓', '最低血壓',
                '好膽固醇', '壞膽固醇', '總膽固醇','體育活動']
zero_to_one_columns = ['血脂異常','PVD','家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞','糖尿病','教育','未婚','收入','肥胖腰圍','性別','區域','護理來源','視力不佳','飲酒','高血壓','肝炎']

# Fill categorical and 0/1 columns with their most frequent value.
# Series.mode() returns a Series; passing it straight to fillna aligns on the
# index and only fills the rows labelled like the mode Series (i.e. row 0),
# so the scalar mode is extracted explicitly. Results are assigned back to the
# column instead of relying on an in-place fill of a column selection.
for col in str_columns + zero_to_one_columns:
    col_modes = data_train[col].mode()
    if not col_modes.empty:
        data_train[col] = data_train[col].fillna(col_modes.iloc[0])

# Fill numeric columns with their mean.
for col in num_columns:
    data_train[col] = data_train[col].fillna(data_train[col].mean())

# Integer-encode the categorical columns after filling, so a missing value is
# not turned into its own 'nan' category first.
# NOTE(review): columns that are already integer-coded are re-encoded through
# their string form here, which orders labels lexicographically ('10' < '2').
for col in str_columns:
    data_train[col] = LabelEncoder().fit_transform(data_train[col].astype(str))
data_train.info()
# Column groups used by the preprocessing steps (no '肝炎' target in the test set).
num_columns = ['年齡','體重','身高','體重指數', '腰圍', '最高血壓', '最低血壓',
                '好膽固醇', '壞膽固醇', '總膽固醇','體育活動']
zero_to_one_columns = ['血脂異常','PVD','家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞','糖尿病','教育','未婚','收入','肥胖腰圍','性別','區域','護理來源','視力不佳','飲酒','高血壓']

# Fill categorical and 0/1 columns with their most frequent value.
# Series.mode() returns a Series; passing it straight to fillna aligns on the
# index and only fills the rows labelled like the mode Series (i.e. row 0),
# so the scalar mode is extracted explicitly. Results are assigned back to the
# column instead of relying on an in-place fill of a column selection.
for col in str_columns + zero_to_one_columns:
    col_modes = data_test[col].mode()
    if not col_modes.empty:
        data_test[col] = data_test[col].fillna(col_modes.iloc[0])

# Fill numeric columns with their mean.
for col in num_columns:
    data_test[col] = data_test[col].fillna(data_test[col].mean())

# Integer-encode the categorical columns after filling, so a missing value is
# not turned into its own 'nan' category first.
# NOTE(review): columns that are already integer-coded are re-encoded through
# their string form here, which orders labels lexicographically ('10' < '2').
for col in str_columns:
    data_test[col] = LabelEncoder().fit_transform(data_test[col].astype(str))

# Summarise the frame that was just processed (this cell previously printed
# data_train.info(), apparently a copy-paste slip).
data_test.info()

from sklearn.ensemble import RandomForestRegressor
# Column groups handed to the ALF imputer.
num_columns = [
    '年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
    '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動',
]
zero_to_one_columns = ['血脂異常', 'PVD', '家庭高血壓', '家族糖尿病']
str_columns = [
    '家族肝炎', '慢性疲勞', '糖尿病', '肥胖腰圍',
    '性別', '區域', '飲酒', '高血壓',
]

# NOTE(review): the original comment at this point read "use PCA (principal
# component analysis)", but no PCA step appears in the code below.

# Full column list for the imputer: the 'ALF' target first, then every feature.
n = ['ALF', *num_columns, *zero_to_one_columns, *str_columns]
print(n)
### Fill the missing ALF values with a RandomForestRegressor
def set_missing_alf(df):
    """Impute missing ``ALF`` values in *df* with a random-forest regressor.

    The regressor is fitted on the rows whose ``ALF`` is known, using the
    module-level column list ``n`` (``'ALF'`` first, followed by the feature
    columns), and its predictions are written into the rows whose ``ALF`` is
    missing.

    Args:
        df: DataFrame containing an ``'ALF'`` column and every column in ``n``.

    Returns:
        The same DataFrame object, modified in place. When no ``ALF`` value is
        missing it is returned untouched and no model is fitted.
    """
    alf_missing = df['ALF'].isnull()

    # Nothing to impute: return early instead of calling predict() on an
    # empty array, which raises.
    if not alf_missing.any():
        return df

    # Keep only the target and the feature columns, target in column 0.
    alf_df = df[n]

    # Split into rows with a known ALF and rows with a missing ALF.
    known_alf = alf_df[~alf_missing].values
    unknown_alf = alf_df[alf_missing].values

    # y is the ALF target, X the feature values.
    y = known_alf[:, 0]
    X = known_alf[:, 1:]

    # Fit the regressor on the known rows.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predict ALF for the rows where it is missing.
    predicted_alf = rfr.predict(unknown_alf[:, 1:])

    # Write the predictions back into the original frame.
    df.loc[alf_missing, 'ALF'] = predicted_alf

    return df

# Impute the missing ALF values of the training set and of the test set,
# each with its own call (training set first).
data_train, data_test = set_missing_alf(data_train), set_missing_alf(data_test)

import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

%matplotlib inline
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Submission template; the prediction column is filled in after training.
submission = pd.read_csv(path + 'submission_example.csv')
data_train.head()

# Label encoding: fit ONE encoder per column on the labels of both sets, so a
# given label maps to the same integer in train and test. Fitting a separate
# encoder on each set can assign different codes to the same label.
for col in str_columns:
    train_labels = data_train[col].astype(str)
    test_labels = data_test[col].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_labels, test_labels]))
    data_train[col] = encoder.transform(train_labels)
    data_test[col] = encoder.transform(test_labels)

# Min-max scaling: learn the range from the training set only and apply the
# same transformation to the test set. Fitting a second scaler on the test set
# would put the two sets on different scales.
scaler = MinMaxScaler().fit(data_train[num_columns])
data_train[num_columns] = scaler.transform(data_train[num_columns])
data_test[num_columns] = scaler.transform(data_test[num_columns])


# Number of cross-validation folds. It is also the divisor used to average
# the per-fold test predictions, so it is defined in a single place.
N_SPLITS = 6

# Every column except the target and the row identifier is a feature.
columns = [col for col in data_train.columns if col not in ('肝炎', 'ID')]

train_x, train_y = data_train[columns].values, data_train['肝炎'].values
test_x = data_test[columns].values

# Accumulator for the fold-averaged positive-class probability.
submission['hepatitis'] = 0.0

kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=False)
model = CatBoostClassifier(
    iterations=600,        # tunable
    learning_rate=0.04,    # tunable
    loss_function='Logloss',
)
for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]
    # Refit on this fold, keeping the iteration that scores best on the
    # validation part.
    model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)
    # Add this fold's share of the averaged test-set probability.
    submission['hepatitis'] += model.predict_proba(test_x)[:, 1] / kfold.get_n_splits()

人開始變懶了,就不寫詳細註釋了;如果有問題,請在評論區留言給我(雖然不一定有人看)。
在這裡插入圖片描述

相關文章