統計缺失個數
na_count<- function ( data, x) {
sum( data[ , x] == "" | is.na( data[ , x] ) )
}
eg:
na_count( Data_Tmp, "hypertension_before" )
插補
centralImputation<- function ( data, xx)
{
for ( i in c( xx) ) {
if ( any( idx <- which( data[ , i] == "" | is.na( data[ , i] ) ) ) )
data[ idx, i] <- data[ sample( 1 : length( data[ , i] ) , length( idx) ) , i]
}
data
}
one hot procedure
for ( i in 1 : ncol( Data_Tmp) )
{
if ( class( Data_Tmp[ , i] ) == "character" ) {
Data_Tmp[ , i] <- as.factor( Data_Tmp[ , i] )
}
}
NN <- sum( sapply( Data_Tmp, FUN = class) != "factor" )
NRT <- dim( Data_Tmp) [ 1 ]
NR_Tmp <- dim( Data_Tmp) [ 2 ]
Matrix_Nu <- matrix( 0 , nrow = NRT, ncol = NN)
colnames( Matrix_Nu) <- names( Data_Tmp) [ sapply( Data_Tmp, FUN = class) != "factor" ]
J <- 1
for ( i in 1 : NR_Tmp)
{
if ( ! is.factor( Data_Tmp[ , i] ) )
{
Matrix_Nu[ , J] <- Data_Tmp[ , i]
J <- J+ 1
}
}
NC <- sum( sapply( Data_Tmp, FUN = class) == "factor" )
Matrix_Ca <- matrix( factor( 0 ) , nrow = NRT, ncol = NC)
colnames( Matrix_Ca) <- names( Data_Tmp) [ sapply( Data_Tmp, FUN = class) == "factor" ]
J <- 1
for ( i in 1 : NR_Tmp)
{
if ( is.factor( Data_Tmp[ , i] ) )
{
QQbq <- Data_Tmp[ , i]
levels( QQbq) <- c( levels( QQbq) , "NA" )
QQbq[ is.na( QQbq) ] <- "NA"
Matrix_Ca[ , J] <- QQbq
J <- J+ 1
}
}
DF_Ca <- as.data.frame( Matrix_Ca)
for ( i in 1 : ncol( DF_Ca) ) {
DF_Ca[ , i] <- as.factor( DF_Ca[ , i] )
}
DF_Ca<- DF_Ca[ , as.data.frame( DF_Ca[ 1 , ] %>% c %>% sapply( FUN = nlevels) ) [ , 1 ] != 1 ]
( DF_Ca[ 1 , ] %>% c %>% sapply( FUN = nlevels) )
xnam <- names( DF_Ca)
fmla <- as.formula( paste( "~ " , paste( xnam, collapse = "+" ) ) )
Ca_M <- model.matrix( fmla, DF_Ca)
Training_Final <- cbind( Matrix_Nu, Ca_M)
DF_Ca %>% as_tibble %>% apply( MARGIN = 2 , FUN = levels)
Training_Finals <- cbind( scale( Matrix_Nu) , Ca_M)
Training_Finals %>% glimpse
DF_Ca[ 1 , ] %>% c %>% sapply( FUN = nlevels)
head( Matrix_Nu)
Training_Final1s<- as.data.frame( Training_Finals)