泰坦尼克號生存預測邏輯迴歸,kaggle渣渣排名

microspore發表於2020-11-19

泰坦尼克號生存預測,kaggle得分0.77,4000多排名,實在沒有辦法提高排名了。

# -*- coding: utf-8 -*-
# @Time : 2020/11/2 22:17
# @Author : spore
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Widen pandas console output so PyCharm does not elide columns when printing.
for _opt in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_opt, 1000)


# Load training and test data, then stack them so the feature
# engineering below is applied consistently to both sets.
df = pd.read_csv(r"D:\test\train.csv")
pr = pd.read_csv(r"D:\test\test.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and preserves row order here.
df = pd.concat([df, pr], ignore_index=True, sort=True)
# -- inspection helpers (disabled) --
# print(df.head(2)); print(df.info()); print(df.isnull().sum()); print(df.describe())

# Fill missing Age/Fare with the column mean.
# NOTE(review): df's mean is computed over train+test combined, which leaks
# test-set statistics into the training features — confirm this is intended.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df["Fare"].fillna(df["Fare"].mean())
pr['Age'] = pr['Age'].fillna(pr['Age'].mean())
pr['Fare'] = pr["Fare"].fillna(pr["Fare"].mean())

# Embarked has only two missing values; fill with the mode 'S'
# (found by inspecting df["Embarked"].value_counts()).
df['Embarked'] = df["Embarked"].fillna("S")
pr['Embarked'] = pr["Embarked"].fillna("S")

# Cabin is mostly missing; mark unknown cabins with 'U' (for "Unknown").
df['Cabin'] = df["Cabin"].fillna("U")
pr['Cabin'] = pr["Cabin"].fillna("U")

# Survived is the label, so it is intentionally left untouched
# (NaN on the appended test rows).

# Integer-encode string columns with LabelEncoder.
# NOTE(review): df and pr get independently fitted encoders, so the same
# ticket/sex string may map to different integers in each frame — this is
# harmless only if pr's encoded columns are never fed to the model; verify.
for name in ['Ticket', "Sex"]:
    df[name] = LabelEncoder().fit_transform(df[name])
for name in ["Ticket", "Sex"]:
    pr[name] = LabelEncoder().fit_transform(pr[name])

# ---- Feature engineering ----

# Reduce Cabin to its deck letter (the cabin code's first character;
# 'U' marks cabins that were missing and filled earlier).
df['Cabin'] = df['Cabin'].str[0]

# One-hot encode each categorical column and drop the original;
# get_dummies columns are appended on the right of the frame.
for _col in ('Embarked', 'Pclass', 'Cabin'):
    _dummies = pd.get_dummies(df[_col], prefix=_col)
    df = pd.concat([df.drop(_col, axis=1), _dummies], axis=1)

# Direct-family features: family size (Parch + SibSp + the passenger),
# bucketed into single / small (2-4) / large (5+) indicator columns.
familyDF = pd.DataFrame()
familyDF['FamilySize'] = df['Parch'] + df['SibSp'] + 1
familyDF['Family_Single'] = familyDF['FamilySize'].map(lambda n: int(n == 1))
familyDF['Family_Small'] = familyDF['FamilySize'].map(lambda n: int(2 <= n <= 4))
familyDF['Family_Large'] = familyDF['FamilySize'].map(lambda n: int(n >= 5))
df = pd.concat([df, familyDF], axis=1)

def gettitle(name):
    """Extract the honorific title (e.g. 'Mr', 'Miss') from a passenger name.

    Names follow the pattern 'Surname, Title. Given names', so the title
    is the text between the first comma and the next period, stripped of
    surrounding whitespace.
    """
    segment = name.split(',')[1]
    return segment.split('.')[0].strip()


# Consolidate the raw honorific titles into six categories, then
# one-hot encode them and replace the free-text Name column.
_TITLE_GROUPS = {
    'Officer': ('Capt', 'Col', 'Major', 'Dr', 'Rev'),
    'Royalty': ('Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'),
    'Mrs': ('Mme', 'Ms', 'Mrs'),
    'Mr': ('Mr',),
    'Miss': ('Mlle', 'Miss'),
    'Master': ('Master',),
}
title_map = {raw: grouped for grouped, raws in _TITLE_GROUPS.items() for raw in raws}

# Extract each passenger's title, map it to its category, one-hot encode.
# (Title frequency was inspected via groupby('Title').count() during EDA.)
titleDF = pd.DataFrame()
titleDF['Title'] = df['Name'].map(gettitle).map(title_map)
titleDF = pd.get_dummies(titleDF['Title'], prefix='Title')

# Append the dummy columns and drop the raw Name column.
df = pd.concat([df, titleDF], axis=1)
df.drop('Name', axis=1, inplace=True)


# ---- Dataset split ----
# Features are everything except the Survived label.
df1 = df.drop("Survived", axis=1)

# Rows 0-890 are the original training set; rows 891-1308 are the
# appended Kaggle test rows to predict (.loc bounds are inclusive).
train = df1.loc[0:890, :]
lab = df.loc[0:890, "Survived"]
pred_X = df1.loc[891:1308, :]
print(pred_X.head(3))

# Hold out 10% for validation, then hand the model plain ndarrays.
x_train, x_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(train, lab, test_size=0.1, random_state=666)
)
# ---- Model training and Kaggle submission ----
# Earlier experiments (a KNN k-sweep, logistic regression — which was
# mistakenly fitted on the validation split — and XGBoost multi:softmax)
# scored worse and were removed; random forest performed best.

# Random forest classifier. random_state is pinned so the submission is
# reproducible from run to run (previously the output was nondeterministic).
model = RandomForestClassifier(n_estimators=100, random_state=666)
model.fit(x_train, y_train)

# Validation accuracy on the held-out 10% split.
fscore = model.score(x_test, y_test)
print('\n模型正確率為:', fscore)

# Predict the 418 Kaggle test rows.
pred_y = model.predict(pred_X)

# Kaggle requires integer 0/1 values in the Survived column.
pred_y = pred_y.astype(int)

# PassengerId comes from the untouched test frame (index 0-417, inclusive).
passenger_id = pr.loc[0:417, 'PassengerId']
predDF = pd.DataFrame({'PassengerId': passenger_id, 'Survived': pred_y})

# Write the submission file for upload.
predDF.to_csv(r'D:\test\Titanic_fucklog.csv', index=False)

相關文章