Machine Learning with Sklearn
Python 的 sklearn 是一個真香的機器學習package。
廢話不多說,這裡分享一套我自己摸索出來、基於 sklearn 做資料初步分析的流程(假設資料存放在 Excel 檔中,細節請見下方程式碼):
1. read in data and clean
# Load the workbook into a DataFrame ('**.xlsx' is a placeholder path).
raw_data = pd.read_excel('**.xlsx')
# axis=0 operates on rows (not columns): with how='any', every row that
# contains at least one missing value is dropped.
clean_data = raw_data.dropna(axis=0, how='any')
2. feature correlation analysis
# Short alias for the cleaned table; later steps keep using `d`.
d = clean_data

# Pairwise correlation matrix of the columns.
corr = d.corr()

# One large figure; the heatmap is drawn on the axes created here.
f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    ax=ax,
    cmap='Blues',
    square=True,
    vmax=1,
)
3. feature cluster study
from scipy.cluster.hierarchy import dendrogram, linkage

# Take every column except the first as a NumPy array, then transpose so that
# each ROW of `m` is one feature; linkage() clusters rows, so this clusters
# features rather than samples.
# NOTE(review): presumably the first column is an id/label column - confirm.
m = d.iloc[:, 1:].values
m = m.T

# Agglomerative clustering of the features using Ward linkage.
Z = linkage(m, 'ward')

plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('relative index')
dendrogram(
    Z,
    leaf_rotation=45.,   # rotates the x axis labels
    leaf_font_size=16.,  # font size for the x axis labels
    # One label per clustered feature: the same columns that were put into `m`.
    labels=list(d.columns[1:]),
)
4. model training with ROC plot
def roc_plot(fpr,tpr,roc_auc,color,desc,auc,title):
    """Draw one ROC curve plus the diagonal reference line on the current axes.

    No new figure is created here, so successive calls overlay their curves
    on whatever figure is currently active.

    Args:
        fpr: x values of the curve (false positive rates).
        tpr: y values of the curve (true positive rates).
        roc_auc: Unused by this function; kept so existing callers still work.
        color: Passed to ``plt.plot`` as its third positional argument, i.e.
            a matplotlib format string such as ``'b'`` or ``'g'``.
        desc: Text inserted into the legend label for this curve.
        auc: Number shown in the legend label, formatted to two decimals.
        title: Text appended to the fixed plot-title prefix.

    Returns:
        None. The function only draws via ``plt``.
    """
    lw = 2
    plt.plot(fpr, tpr, color,
             lw=lw, label='AUC of '+ desc +' ={:.2f}'.format(auc))
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    # Upper y-limit sits slightly above 1.0.
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC for Lung Cancer with '+title)
    plt.legend(loc="lower right")
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from itertools import cycle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf_LR = LogisticRegression(random_state=0)

# Endless colour cycle so every dataset gets a colour, however many there are.
color_lst = cycle(['b','g','r','c','m','y'])

# NOTE(review): `study_data` is not defined in this snippet; the loop indexes
# it by key and splits each value, so it is presumably a dict of DataFrames
# that each contain a 'Path' column - confirm.
for key,color in zip(study_data.keys(),color_lst):
    # 80/20 split with a fixed seed so the split is reproducible.
    train, test = train_test_split(study_data[key], test_size = 0.2, random_state=0)

    # 'Path' is the target column; every other column is used as a feature.
    survival_classes_train = train['Path']
    features_train = train.drop('Path', axis = 1)
    survival_classes_test = test['Path']
    features_test = test.drop('Path', axis = 1)

    # Scaling lives inside the pipeline, so the scaler is fitted together
    # with the classifier in the single fit() call below.
    clf_pipe = make_pipeline(StandardScaler(),clf_LR)
    clf_pipe.fit(features_train, survival_classes_train) # fit with training data only

    # Column 1 of predict_proba is the score passed to roc_curve.
    pred_y_prob = clf_pipe.predict_proba(features_test)
    fpr,tpr, _ = roc_curve(survival_classes_test,pred_y_prob[:,1])
    roc_auc = auc(fpr,tpr)

    # NOTE(review): the curve is computed on the held-out test split, but the
    # title text says 'train_cohort' - confirm which is intended.
    roc_plot(fpr,tpr,roc_auc,color,desc=key,auc = roc_auc, title = 'train_cohort')
