智算之道-肝炎預測
肝炎預測
-
資料處理
缺失較少的欄位使用眾數或平均數填充;缺失較多的欄位(ALF)則使用隨機森林進行預估。
-
模型訓練
使用catboost進行預測,程式碼如下:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
#Ignore RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Load the competition data. Both CSV files live in the same directory, so the
# directory is defined once and reused for every file.
path = '/home/kesci/data/competition_A/'
data_train = pd.read_csv(path + 'train_set.csv', engine='python', encoding='UTF-8')
data_test = pd.read_csv(path + 'test_set.csv')

# Widen the pandas display limits for notebook inspection.
# The fully qualified option names are required: a short pattern such as
# 'max_columns' matches more than one option on recent pandas releases
# (display.* and styler.render.*), which makes set_option raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Quick look at the training data (schema, first rows, summary, row count).
data_train.info()
data_train.head()
data_train.describe()
data_train.shape[0]
# Fill missing heights in the training set with the mean height of the row's
# region.
# transform('mean') returns, for every row, the mean of that row's region
# group, so a single fillna replaces the per-row loop. This no longer depends
# on a hard-coded row count or on the index being 0..N-1, and it covers every
# region present in the data. Rows whose region is missing get no group mean
# and therefore stay unfilled.
region_mean_height = data_train.groupby('區域')['身高'].transform('mean')
data_train['身高'] = data_train['身高'].fillna(region_mean_height)

# Notebook-style inspection: mean height per region and per gender.
data_train.groupby('區域')['身高'].mean()
data_train['身高'][data_train['性別'] == 'F'].mean()
data_train['身高'][data_train['性別'] == 'M'].mean()
# Fill missing weights in the training set with the mean weight of the row's
# region.
# transform('mean') broadcasts each region's mean back onto its rows, so one
# fillna replaces the per-row loop and no row count is hard-coded. Rows whose
# region is missing get no group mean and therefore stay unfilled.
region_mean_weight = data_train.groupby('區域')['體重'].transform('mean')
data_train['體重'] = data_train['體重'].fillna(region_mean_weight)

# Notebook-style inspection: mean weight per region.
data_train.groupby('區域')['體重'].mean()
# Fill missing body-mass-index values in the training set from height, using
# a gender-specific linear rule. Boolean masks replace the row loop, so the
# code works for any number of rows and any index.
# NOTE(review): (height - 80) * 0.7 and (height - 70) * 0.6 look like
# ideal-weight estimates rather than a BMI formula; kept as-is -- confirm.
bmi_missing = data_train['體重指數'].isnull()
fill_female = bmi_missing & (data_train['性別'] == 'F')
fill_male = bmi_missing & (data_train['性別'] == 'M')
data_train.loc[fill_female, '體重指數'] = (data_train.loc[fill_female, '身高'] - 80) * 0.7
data_train.loc[fill_male, '體重指數'] = (data_train.loc[fill_male, '身高'] - 70) * 0.6
# Fill missing obesity-waist flags in the training set: 1.0 when the body
# mass index is above 30, otherwise 0.0. A missing index compares as False
# and therefore yields 0.0, exactly as in a row-by-row check.
waist_missing = data_train['肥胖腰圍'].isnull()
data_train.loc[waist_missing, '肥胖腰圍'] = (
    data_train.loc[waist_missing, '體重指數'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

num_columns = ['年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
               '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動']
zero_to_one_columns = ['血脂異常', 'PVD', '家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞', '糖尿病', '教育', '未婚', '收入', '肥胖腰圍', '性別', '區域', '護理來源', '視力不佳', '飲酒', '高血壓', '肝炎']

# Remember which rows belong to the west region BEFORE the column is
# label-encoded: once encoded the column holds integers, so comparing it with
# the string 'west' can never match.
is_west = data_train['區域'] == 'west'

# Fill missing hypertension flags with the most frequent value before
# encoding. Doing it afterwards has no effect, because astype(str) turns NaN
# into the literal label 'nan', which the encoder treats as a real category.
hypertension_modes = data_train['高血壓'].mode()
if not hypertension_modes.empty:
    data_train['高血壓'] = data_train['高血壓'].fillna(hypertension_modes.iloc[0])

# Replace every categorical column with integer codes; tqdm shows progress.
for col in tqdm(str_columns):
    lbl = LabelEncoder()
    data_train[col] = lbl.fit_transform(data_train[col].astype(str))

# Rule-based pre-fill of ALF: rows with hypertension code 0, and rows from the
# west region, get ALF = 0.0. Everything still missing is left for later
# imputation.
# NOTE(review): code 0 is the lexicographically smallest hypertension label;
# presumably that is the "no hypertension" value -- confirm against the data.
alf_missing = data_train['ALF'].isnull()
data_train.loc[alf_missing & (data_train['高血壓'] == 0), 'ALF'] = 0.0
data_train.loc[alf_missing & is_west, 'ALF'] = 0.0
data_train.info()
# Display limits for inspecting the test set. Fully qualified option names
# are used because short patterns such as 'max_columns' match several options
# on recent pandas releases and make set_option raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Quick look at the test data (first rows, summary, row count).
data_test.head()
data_test.describe()
data_test.shape[0]
# Fill missing heights in the test set with the mean height of the row's
# region (means are computed from the test set itself).
# transform('mean') broadcasts each region's mean back onto its rows, so one
# fillna replaces the per-row loop; no row count is hard-coded and the index
# does not need to be 0..N-1. Rows whose region is missing stay unfilled.
region_mean_height = data_test.groupby('區域')['身高'].transform('mean')
data_test['身高'] = data_test['身高'].fillna(region_mean_height)

# Notebook-style inspection: mean height per region and per gender.
data_test.groupby('區域')['身高'].mean()
data_test['身高'][data_test['性別'] == 'F'].mean()
data_test['身高'][data_test['性別'] == 'M'].mean()
# Fill missing weights in the test set with the mean weight of the row's
# region (means are computed from the test set itself).
# One vectorised fillna replaces the per-row loop, so no row count is
# hard-coded. Rows whose region is missing stay unfilled.
region_mean_weight = data_test.groupby('區域')['體重'].transform('mean')
data_test['體重'] = data_test['體重'].fillna(region_mean_weight)

# Notebook-style inspection: mean weight per region.
data_test.groupby('區域')['體重'].mean()
# Fill missing body-mass-index values in the test set from height, using a
# gender-specific linear rule. Boolean masks replace the row loop, so the code
# works for any number of rows and any index.
# NOTE(review): (height - 80) * 0.7 and (height - 70) * 0.6 look like
# ideal-weight estimates rather than a BMI formula; kept as-is -- confirm.
bmi_missing = data_test['體重指數'].isnull()
fill_female = bmi_missing & (data_test['性別'] == 'F')
fill_male = bmi_missing & (data_test['性別'] == 'M')
data_test.loc[fill_female, '體重指數'] = (data_test.loc[fill_female, '身高'] - 80) * 0.7
data_test.loc[fill_male, '體重指數'] = (data_test.loc[fill_male, '身高'] - 70) * 0.6
# Fill missing obesity-waist flags in the test set: 1.0 when the body mass
# index is above 30, otherwise 0.0. A missing index compares as False and
# therefore yields 0.0, exactly as in a row-by-row check.
waist_missing = data_test['肥胖腰圍'].isnull()
data_test.loc[waist_missing, '肥胖腰圍'] = (
    data_test.loc[waist_missing, '體重指數'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

num_columns = ['年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
               '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動']
zero_to_one_columns = ['血脂異常', 'PVD', '家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞', '糖尿病', '教育', '未婚', '收入', '肥胖腰圍', '性別', '區域', '護理來源', '視力不佳', '飲酒', '高血壓']

# Remember which rows belong to the west region BEFORE the column is
# label-encoded: once encoded the column holds integers, so comparing it with
# the string 'west' can never match.
is_west = data_test['區域'] == 'west'

# Fill missing hypertension flags with the most frequent value before
# encoding. Doing it afterwards has no effect, because astype(str) turns NaN
# into the literal label 'nan', which the encoder treats as a real category.
hypertension_modes = data_test['高血壓'].mode()
if not hypertension_modes.empty:
    data_test['高血壓'] = data_test['高血壓'].fillna(hypertension_modes.iloc[0])

# Replace every categorical column with integer codes; tqdm shows progress.
for col in tqdm(str_columns):
    lbl = LabelEncoder()
    data_test[col] = lbl.fit_transform(data_test[col].astype(str))

# Rule-based pre-fill of ALF: rows with hypertension code 0, and rows from the
# west region, get ALF = 0.0. Everything still missing is left for later
# imputation.
# NOTE(review): code 0 is the lexicographically smallest hypertension label;
# presumably that is the "no hypertension" value -- confirm against the data.
alf_missing = data_test['ALF'].isnull()
data_test.loc[alf_missing & (data_test['高血壓'] == 0), 'ALF'] = 0.0
data_test.loc[alf_missing & is_west, 'ALF'] = 0.0
data_test.info()
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

num_columns = ['年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
               '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動']
zero_to_one_columns = ['血脂異常', 'PVD', '家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞', '糖尿病', '教育', '未婚', '收入', '肥胖腰圍', '性別', '區域', '護理來源', '視力不佳', '飲酒', '高血壓', '肝炎']

# Integer-encode the categorical columns of the training set.
for col in str_columns:
    lbl = LabelEncoder()
    data_train[col] = lbl.fit_transform(data_train[col].astype(str))

# Fill the remaining gaps: most frequent value for categorical and 0/1
# columns, mean for numeric columns.
# Series.mode() returns a Series (there can be ties), so its first entry is
# taken explicitly; passing the Series itself to fillna aligns on the index
# and only ever fills the rows whose labels happen to match. The result is
# assigned back instead of using inplace=True on a column selection, which is
# not guaranteed to write through to the frame.
for col in str_columns + zero_to_one_columns:
    modes = data_train[col].mode()
    if not modes.empty:
        data_train[col] = data_train[col].fillna(modes.iloc[0])
for col in num_columns:
    data_train[col] = data_train[col].fillna(data_train[col].mean())
data_train.info()
num_columns = ['年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
               '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動']
zero_to_one_columns = ['血脂異常', 'PVD', '家庭高血壓', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲勞', '糖尿病', '教育', '未婚', '收入', '肥胖腰圍', '性別', '區域', '護理來源', '視力不佳', '飲酒', '高血壓']

# Integer-encode the categorical columns of the test set.
for col in str_columns:
    lbl = LabelEncoder()
    data_test[col] = lbl.fit_transform(data_test[col].astype(str))

# Fill the remaining gaps: most frequent value for categorical and 0/1
# columns, mean for numeric columns.
# Series.mode() returns a Series (there can be ties), so its first entry is
# taken explicitly; passing the Series itself to fillna aligns on the index
# and only ever fills the rows whose labels happen to match. The result is
# assigned back instead of using inplace=True on a column selection, which is
# not guaranteed to write through to the frame.
for col in str_columns + zero_to_one_columns:
    modes = data_test[col].mode()
    if not modes.empty:
        data_test[col] = data_test[col].fillna(modes.iloc[0])
for col in num_columns:
    data_test[col] = data_test[col].fillna(data_test[col].mean())

# Report on the frame that was just processed (the test set).
data_test.info()
from sklearn.ensemble import RandomForestRegressor

# Continuous measurements.
num_columns = [
    '年齡', '體重', '身高', '體重指數', '腰圍', '最高血壓', '最低血壓',
    '好膽固醇', '壞膽固醇', '總膽固醇', '體育活動',
]
# Binary 0/1 indicators.
zero_to_one_columns = [
    '血脂異常', 'PVD', '家庭高血壓', '家族糖尿病',
]
# Categorical columns (a reduced subset compared with the earlier cells).
str_columns = [
    '家族肝炎', '慢性疲勞', '糖尿病', '肥胖腰圍', '性別', '區域', '飲酒', '高血壓',
]

# Columns handed to the random-forest ALF imputer. 'ALF' has to stay first
# because the imputer treats column 0 as the regression target.
n = ['ALF', *num_columns, *zero_to_one_columns, *str_columns]
print(n)
def set_missing_alf(df, feature_columns=None):
    """Impute missing ``ALF`` values with a random forest regressor.

    A forest is fitted on the rows where ``ALF`` is known and then used to
    predict ``ALF`` for the rows where it is missing. ``df`` is modified in
    place and also returned.

    Args:
        df: DataFrame with an ``ALF`` column and the feature columns.
        feature_columns: Names of the predictor columns. When omitted, every
            entry of the module-level list ``n`` except ``'ALF'`` is used.

    Returns:
        The same DataFrame object, with the missing ``ALF`` cells filled.
    """
    missing = df['ALF'].isnull()
    # Nothing to impute: return early, because asking the forest to predict
    # for zero rows raises an error instead of returning an empty result.
    if not missing.any():
        return df

    if feature_columns is None:
        feature_columns = [col for col in n if col != 'ALF']
    features = df[list(feature_columns)]

    # Fit on the rows with a known ALF ...
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(features[~missing].values, df.loc[~missing, 'ALF'].values)

    # ... and write the predictions into the rows where ALF is missing.
    predicted_alf = rfr.predict(features[missing].values)
    df.loc[missing, 'ALF'] = predicted_alf
    return df
# Impute the missing ALF values in both frames. Each call fits its own forest
# on the frame it is given, so the test set is imputed from test rows only.
data_train = set_missing_alf(data_train)
data_test = set_missing_alf(data_test)
import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
%matplotlib inline
warnings.filterwarnings('ignore')
# pd.set_option('display.max_rows',None)
# pd.set_option('display.max_columns',None)
submission = pd.read_csv(path + 'submission_example.csv')
data_train.head()

# Categorical encoding.
# One encoder per column is fitted on the values of BOTH frames and then
# applied to each, so a given label always maps to the same integer in train
# and test. Fitting a fresh encoder on the test set assigns codes from the
# test set's own sorted labels, which can differ from the training mapping.
# NOTE(review): if these columns were already encoded separately for train
# and test earlier in the notebook, any mismatch introduced there cannot be
# undone here -- confirm the upstream cells.
for col in str_columns:
    lbl = LabelEncoder()
    train_labels = data_train[col].astype(str)
    test_labels = data_test[col].astype(str)
    lbl.fit(pd.concat([train_labels, test_labels]))
    data_train[col] = lbl.transform(train_labels)
    data_test[col] = lbl.transform(test_labels)

# Min-max normalisation of the numeric columns.
# The scaler learns its ranges from the training set only and the same
# transformation is applied to the test set, so both frames share one scale.
scaler = MinMaxScaler()
data_train[num_columns] = scaler.fit_transform(data_train[num_columns])
data_test[num_columns] = scaler.transform(data_test[num_columns])

# Feature matrix: every training column except the target and the row ID.
columns = [i for i in data_train.columns if i not in ['肝炎', 'ID']]
train_x, train_y = data_train[columns].values, data_train['肝炎'].values
test_x = data_test[columns].values

# Accumulator for the fold-averaged positive-class probability.
submission['hepatitis'] = 0
# Stratified K-fold training. The fold count is defined once so that the
# splitter and the averaging divisor below can never drift apart.
n_splits = 6
kfold = StratifiedKFold(n_splits=n_splits, shuffle=False)
model = CatBoostClassifier(
    iterations=600,  # tunable
    learning_rate=0.04,  # tunable
    loss_function='Logloss'
)
for train_idx, valid_idx in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train_idx], train_y[train_idx]
    X_valid, Y_valid = train_x[valid_idx], train_y[valid_idx]
    # The held-out fold is the evaluation set; use_best_model keeps the
    # iteration that scored best on it.
    model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)
    # Each fold contributes an equal share of the final test probability.
    submission['hepatitis'] += model.predict_proba(test_x)[:, 1] / n_splits
人開始變懶了,就不寫詳細註釋了,有問題的評論區滴滴我(雖然不一定有人看)
相關文章
- 智算之道——2020人工智慧應用挑戰賽(初賽)疾病預測結構化資料人工智慧
- AI算力加速之道AI
- 計算正向智算跨越
- HEVC幀內預測學習(二) 35種預測模式的計算模式
- 多智時代,大資料發展趨勢預測大資料
- 超算與智算融合,中科院團隊構建光伏多時間尺度功率預測模型,可融合氣象資料模型
- 隱私計算FATE-離線預測
- 面向智算服務,構建可觀測體系最佳實踐
- 群邑智庫:2021年夏季版中國媒體行業預測行業
- 群邑智庫:2021年媒介價格漲幅預測及應對
- 小鳥雲:2022年雲端計算趨勢和預測
- IDC:2021年中國雲端計算10大預測
- 正式啟動!綠色智算典型案例徵集|中國智算產業綠色科技大會產業
- 測試無定法,測試必有法:軟體測試策略運用之道
- 重大活動網路攻擊面前,京東智聯雲的攻防之道
- 租金多少才算合理?智慧分析工具Sophon幫你預測房屋租金
- 預測分析 · 員工滿意度預測
- 綠色智算,新質發展——中國智算產業綠色科技大會盛大召開產業
- 效果超AlphaFold系列,量子計算方法用於蛋白質結構預測
- 【Matlab 041期】【數學建模】Matlab 電力預測預測之灰度預測組合預測指數平滑回歸分析Matlab
- 從計算到智算,計算產業掀起什麼樣的浪潮?產業
- 2024年全球決策者對社交媒體預算支出計劃預測(附原資料表)
- 國防預算概述|美國國防部2022財年預算申請
- 軟體測試架構師修煉之道 (一)架構
- 軟體測試架構師修煉之道 (二)架構
- IDC評測:淺談當下IDC生存之道
- 富文字儲存型XSS的模糊測試之道
- 中智諮詢:2021年企業人工成本預算管理實踐調研報告(附下載)
- 機器學習股票價格預測從爬蟲到預測-預測與調參機器學習爬蟲
- [譯] 開啟效能預算
- 浪潮儲存提出雲存智用、運籌新資料的新儲存之道
- 模型訓練時間預測,計算量估計 Scaling Laws for Neural Language Models模型
- IDC:2023年中國雲端計算市場十大預測
- 解讀IBM超級計算機在預測分析領域的前景YEIBM計算機
- 亞馬遜對2022年以後的雲端計算技術預測亞馬遜
- 灰色預測分析
- 人口預測模型模型
- 智算時代需要什麼樣的儲存?