Machine Learning with Sklearn

天羽東臻發表於2020-12-11

Python 的 sklearn 是一個真香的機器學習package。

廢話不多,這裡分享一套我自己摸索的基於sklearn做資料初步分析的流程:(assuming the data is stored in an Excel file; see the code for details)

1. read in data and clean

import pandas as pd

# Load the spreadsheet into a DataFrame ('**.xlsx' is a placeholder path).
raw_data = pd.read_excel('**.xlsx')
# axis=0 drops ROWS (axis=1 would drop columns); how='any' removes a row as
# soon as a single cell in it is missing.
clean_data = raw_data.dropna(axis=0, how='any')

2. feature correlation analysis

# Short alias for the cleaned table; later steps refer to it as `d`.
d = clean_data

# Pairwise correlation between the columns of the table.
corr = d.corr()

# Create one large figure and draw the seaborn heatmap onto its axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, vmax=1, square=True, cmap='Blues', ax=ax)

3. feature cluster study

from scipy.cluster.hierarchy import dendrogram, linkage

# Skip the first column and transpose, so that every row of `m` is one of the
# remaining columns of `d`. linkage() clusters the rows of its input, so this
# clusters the features rather than the samples.
m = d.iloc[:, 1:].values
m = m.T

Z = linkage(m, 'ward')

plt.figure(figsize=(15, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('relative index')
plt.ylabel('distance')
plt.yscale('log')
dendrogram(
    Z,
    leaf_rotation=45.,  # rotates the x axis labels
    leaf_font_size=16.,  # font size for the x axis labels
    # One label per clustered row of `m`, i.e. the columns of `d` after the
    # first one (the original referenced an undefined name `dd`).
    labels=list(d.columns[1:]),
)

plt.show()

4. model training with ROC plot

def roc_plot(fpr, tpr, roc_auc, color, desc, auc, title):
    """Add one ROC curve and a dashed diagonal to the current pyplot figure.

    The function neither creates a figure nor calls ``plt.show()``; the
    caller does both, which lets several curves share one plot.

    Args:
        fpr: x values of the curve (false positive rates).
        tpr: y values of the curve (true positive rates).
        roc_auc: accepted for call compatibility; not used in the body.
        color: passed to ``plt.plot`` as its third positional argument.
        desc: string inserted into the legend entry for this curve.
        auc: number shown in the legend, formatted with two decimals.
        title: string appended to the fixed plot-title prefix.
    """
    line_width = 2
    curve_label = 'AUC of ' + desc + ' ={:.2f}'.format(auc)

    # The curve itself, then the dashed navy diagonal from (0, 0) to (1, 1).
    plt.plot(fpr, tpr, color, lw=line_width, label=curve_label)
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # Fixed axis ranges; the y axis gets a little headroom above 1.0.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC for Lung Cancer with ' + title)
    plt.legend(loc="lower right")


import matplotlib.pyplot as plt
import matplotlib.patches as patches
from itertools import cycle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The same estimator instance is handed to every pipeline below and is
# refitted on each cohort's training split.
clf_LR = LogisticRegression(random_state=0)

plt.figure(figsize=(8,6))
color_lst = cycle(['b','g','r','c','m','y'])
# NOTE(review): `study_data` is not defined in this snippet; presumably a dict
# mapping a cohort name to a DataFrame that has a 'Path' label column —
# confirm against the code that builds it.
for (key, cohort), color in zip(study_data.items(), color_lst):

    # Hold out 20% of the rows for evaluation; fixed seed for repeatability.
    train, test = train_test_split(cohort, test_size = 0.2, random_state=0)
    survival_classes_train = train['Path']
    features_train = train.drop('Path', axis = 1)
    survival_classes_test = test['Path']
    features_test = test.drop('Path', axis = 1)

    # Scaling lives inside the pipeline so it is fitted on training rows only.
    clf_pipe = make_pipeline(StandardScaler(),clf_LR)

    clf_pipe.fit(features_train, survival_classes_train) # fit with training data only

    # Column 1 of predict_proba is used as the score for the ROC curve.
    pred_y_prob = clf_pipe.predict_proba(features_test)
    fpr,tpr, _ = roc_curve(survival_classes_test,pred_y_prob[:,1])
    roc_auc = auc(fpr,tpr)
    roc_plot(fpr,tpr,roc_auc,color,desc=key,auc = roc_auc, title = 'train_cohort')

# All cohorts were drawn onto the single figure created above.
plt.show()

相關文章