# [Rscript] 邏輯迴歸識別學生群體的R實現

# TOMOCAT 發表於 2018-08-13
library(ggplot2)
library(pROC)

### Read the data and inspect it
# NOTE(review): setwd() with an absolute path ties the script to one machine;
# it is kept so that relative paths keep resolving exactly as before.
setwd("D:\\student_recognition")
student <- read.csv("student.csv")
str(student) # inspect the variables

# Share of missing entries in each column. An entry counts as missing when it
# is NA, the literal string "NULL", or an empty string.
# Working column by column (instead of apply() on the whole data frame) avoids
# coercing everything to a character matrix, and the explicit is.na() keeps a
# single NA from turning the whole column's rate into NA.
vapply(
  student,
  function(x) {
    mean(is.na(x) | x == "NULL" | x == "")
  },
  numeric(1)
)

###資料預處理
#略

### Descriptive analysis
# Box plots: see https://blog.csdn.net/TOMOCAT/article/details/80559006
# Compare a variable across groups.
# plyr supports pivot-table-like operations: it splits the data into smaller
# pieces and applies an operation to each piece.
library(plyr)
ddply(
  .data = student[, c("is_student", "student_app")],
  .variables = "is_student",
  .fun = summarise,
  mean_app = mean(student_app)
)
## Example output:
##   is_student  mean_app
## 1          0 0.1745353
## 2          1 0.4435253

### Logistic regression modelling
# cont_var holds the continuous predictors and class_var the categorical
# predictors that remain after data cleaning and preprocessing.
model_student <- student[, c("is_student", cont_var, class_var)]

# Average AUC over five random hold-out splits (30% of the rows are held out
# as the test set each time).
n_rows <- nrow(model_student)
n_test <- round(n_rows * 0.3)
auc <- numeric(5) # preallocated; one AUC per repetition
for (i in seq_along(auc)) {
  # Rows drawn here form the test set; the remaining rows are the training set.
  ind <- sample.int(n_rows, n_test)
  train <- model_student[-ind, ]
  test <- model_student[ind, ]

  # Standardise the continuous variables. The centre and scale are estimated
  # on the training set only and then applied to BOTH sets, so the test data
  # are on the same scale the model was fitted on.
  for (var in cont_var) {
    center <- mean(train[[var]])
    spread <- sd(train[[var]])
    train[[var]] <- (train[[var]] - center) / spread
    test[[var]] <- (test[[var]] - center) / spread
  }

  # Convert the categorical variables to factors.
  for (var in class_var) {
    train[[var]] <- as.factor(train[[var]])
    test[[var]] <- as.factor(test[[var]])
  }

  lm_res <- glm(is_student ~ ., data = train, family = "binomial")
  # type = "response" returns predicted probabilities; a misspelt type is
  # rejected by predict.glm.
  pre <- predict(lm_res, test, type = "response")
  # Namespace-qualified because the result vector above is also named `auc`.
  auc0 <- pROC::auc(test$is_student, pre)
  print(auc0)
  auc[i] <- as.numeric(auc0)
}
mean(auc) # mean AUC over the five hold-out repetitions
# Fit the model on all variables, then select variables by stepwise regression.
# NOTE(review): model_student is used as-is here; whether the class_var columns
# are already factors at this point cannot be seen from this section - confirm.
lm_res_all <- glm(
  formula = is_student ~ .,
  family = "binomial",
  data = model_student
)
step_lm <- step(object = lm_res_all) # stepwise variable selection

# Bar chart of the coefficients kept by the stepwise model.
# One row per coefficient: its value, its name, and whether it is positive.
coef <- data.frame(
  coef = unname(step_lm$coefficients),
  var = names(step_lm$coefficients),
  row.names = names(step_lm$coefficients),
  stringsAsFactors = FALSE
)
coef <- coef[-1, ] # drop the first coefficient (the intercept)
coef$pos <- coef$coef > 0

# NOTE(review): the axis labels below are hard-coded and are applied by
# position, so they presumably have to match the order produced by
# reorder(var, -coef) for one particular fit - verify after refitting.
ggplot(coef, aes(x = reorder(var, -coef), y = coef, fill = pos)) +
  geom_col(position = "identity") +
  scale_x_discrete(
    labels = c(
      "有無學生類APP", "score", "學校線路佔比", "總出行次數",
      "平均出行距離", "出行時段-早高峰", "工作日出行時間標準差",
      "出行時段-平峰", "週末出行次數", "週末出行時間標準差",
      "工作日出行時段-平峰", "週末出行時段-平峰", "工作日出行時段-早高峰",
      "學校線路出行次數", "週末出行時段-早高峰", "工作日出行時段-無",
      "工作日平均出行距離", "週末平均出行距離", "出行時間標準差",
      "工作日出行次數", "週末出行時段-無"
    )
  ) +
  theme(axis.text.x = element_text(angle = 50, hjust = 1, vjust = 1)) +
  labs(fill = "係數正負", x = "變數", y = "係數")
# For the visualisation itself, see the author's blog posts on R plotting.

 

# 相關文章