專案裡的一些小函式

一朵燦燦發表於2021-01-03

統計缺失個數

na_count<-function(data,x){
  sum(data[,x]==""|is.na(data[,x]))
}
eg:
na_count(Data_Tmp,"hypertension_before")

插補

centralImputation<-function (data,xx) 
{
  for (i in c(xx)) {
    if (any(idx <- which(data[, i]==""|is.na(data[, i])))) 
      data[idx,i]<-data[sample(1:length(data[,i]),length(idx)),i]
    
  }
  data
}

one hot procedure

for(i in 1:ncol(Data_Tmp))
{
  if(class(Data_Tmp[,i])=="character"){
    Data_Tmp[,i]<-as.factor(Data_Tmp[,i])
  }
}
NN <- sum(sapply(Data_Tmp, FUN = class) != "factor")
NRT <- dim(Data_Tmp)[1]
NR_Tmp <- dim(Data_Tmp)[2]
Matrix_Nu <- matrix(0, nrow = NRT, ncol = NN)
colnames(Matrix_Nu) <- names(Data_Tmp)[sapply(Data_Tmp, FUN = class) != "factor"]
#Matrix_Nu:數值變數
J <- 1
for(i in 1:NR_Tmp)
{
  if(!is.factor(Data_Tmp[,i]))
  {
    Matrix_Nu[,J] <- Data_Tmp[,i]
    J <- J+1
  }
}



NC <- sum(sapply(Data_Tmp, FUN = class) == "factor")


Matrix_Ca <- matrix(factor(0), nrow = NRT, ncol = NC)

colnames(Matrix_Ca) <- names(Data_Tmp)[sapply(Data_Tmp, FUN = class) == "factor"]

J <- 1


for(i in 1:NR_Tmp)
{
  if(is.factor(Data_Tmp[,i]))
  {
    QQbq <- Data_Tmp[,i]
    levels(QQbq) <- c(levels(QQbq), "NA")
    QQbq[is.na(QQbq)] <- "NA"
    Matrix_Ca[,J] <- QQbq
    J <- J+1
  }
}



DF_Ca <- as.data.frame(Matrix_Ca)
for(i in 1:ncol(DF_Ca)){
  DF_Ca[,i]<-as.factor(DF_Ca[,i])
}
DF_Ca<-DF_Ca[,as.data.frame(DF_Ca[1,] %>% c %>% sapply(FUN = nlevels))[,1]!=1]#delete the factor which levels==1
#DF_Ca<-
(DF_Ca[1,] %>% c %>% sapply(FUN = nlevels))
#sapply(DF_Ca, FUN = levels)
xnam <- names(DF_Ca)
fmla <-  as.formula(paste("~ ", paste(xnam, collapse = "+")) )


Ca_M <- model.matrix(fmla, DF_Ca)

#sapply(Matrix_Ca, FUN = levels)
#Matrix_Nu[is.na(Matrix_Nu)] <- 0
Training_Final <- cbind(Matrix_Nu, Ca_M)
DF_Ca %>% as_tibble %>% apply(MARGIN = 2, FUN = levels)
Training_Finals <- cbind(scale(Matrix_Nu), Ca_M)
Training_Finals %>% glimpse
DF_Ca[1,] %>% c %>% sapply(FUN = nlevels)
head(Matrix_Nu)
Training_Final1s<-as.data.frame(Training_Finals)

相關文章