Impute missing values in the columns of a data.frame with each column's mean, median or mode.
impute_dt(.data, ..., .func = "mode")
A data.table
# Toy passenger-style data set; every column contains at least one NA
# so that each imputation strategy has something to fill in.
Pclass <- c(3, 1, 3, 1, 3, 2, 2, 3, NA, NA)
Sex <- c(
  "male", "male", "female", "female", "female",
  "female", NA, "male", "female", NA
)
Age <- c(22, 38, 26, 35, NA, 45, 25, 39, 28, 40)
SibSp <- c(0, 1, 3, 1, 2, 3, 2, 2, NA, 0)
Fare <- c(7.25, 71.3, 7.92, NA, 8.05, 8.46, 51.9, 60, 32, 15)
Embarked <- c("S", NA, "S", "Q", "Q", "S", "C", "S", "C", "S")

# Column names are taken from the variable names.
data <- data.frame(Pclass, Sex, Age, SibSp, Fare, Embarked)
data
#> Pclass Sex Age SibSp Fare Embarked
#> 1 3 male 22 0 7.25 S
#> 2 1 male 38 1 71.30 <NA>
#> 3 3 female 26 3 7.92 S
#> 4 1 female 35 1 NA Q
#> 5 3 female NA 2 8.05 Q
#> 6 2 female 45 3 8.46 S
#> 7 2 <NA> 25 2 51.90 C
#> 8 3 male 39 2 60.00 S
#> 9 NA female 28 NA 32.00 C
#> 10 NA <NA> 40 0 15.00 S
data %>% impute_dt() # by default `.func` is "mode", applied to every column
#> Pclass Sex Age SibSp Fare Embarked
#> <num> <char> <num> <num> <num> <char>
#> 1: 3 male 22 0 7.25 S
#> 2: 1 male 38 1 71.30 S
#> 3: 3 female 26 3 7.92 S
#> 4: 1 female 35 1 7.25 Q
#> 5: 3 female 22 2 8.05 Q
#> 6: 2 female 45 3 8.46 S
#> 7: 2 female 25 2 51.90 C
#> 8: 3 male 39 2 60.00 S
#> 9: 3 female 28 2 32.00 C
#> 10: 3 female 40 0 15.00 S
data %>% impute_dt(is.numeric,.func = "mean") # only numeric columns are filled (with the column mean); character NAs remain
#> Pclass Sex Age SibSp Fare Embarked
#> <num> <char> <num> <num> <num> <char>
#> 1: 3.00 male 22.00000 0.000000 7.25000 S
#> 2: 1.00 male 38.00000 1.000000 71.30000 <NA>
#> 3: 3.00 female 26.00000 3.000000 7.92000 S
#> 4: 1.00 female 35.00000 1.000000 29.09778 Q
#> 5: 3.00 female 33.11111 2.000000 8.05000 Q
#> 6: 2.00 female 45.00000 3.000000 8.46000 S
#> 7: 2.00 <NA> 25.00000 2.000000 51.90000 C
#> 8: 3.00 male 39.00000 2.000000 60.00000 S
#> 9: 2.25 female 28.00000 1.555556 32.00000 C
#> 10: 2.25 <NA> 40.00000 0.000000 15.00000 S
data %>% impute_dt(is.numeric,.func = "median") # only numeric columns are filled (with the column median); character NAs remain
#> Pclass Sex Age SibSp Fare Embarked
#> <num> <char> <num> <num> <num> <char>
#> 1: 3.0 male 22 0 7.25 S
#> 2: 1.0 male 38 1 71.30 <NA>
#> 3: 3.0 female 26 3 7.92 S
#> 4: 1.0 female 35 1 15.00 Q
#> 5: 3.0 female 35 2 8.05 Q
#> 6: 2.0 female 45 3 8.46 S
#> 7: 2.0 <NA> 25 2 51.90 C
#> 8: 3.0 male 39 2 60.00 S
#> 9: 2.5 female 28 2 32.00 C
#> 10: 2.5 <NA> 40 0 15.00 S
# Custom imputer: replaces every NA in `x` with half of the observed
# range, i.e. (max - min) / 2 computed over the non-missing values.
# Returns `x` with the same length and the NAs filled in.
my_fun <- function(x) {
  half_range <- (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) / 2
  replace(x, is.na(x), half_range)
}
data %>% impute_dt(is.numeric,.func = my_fun) # `.func` also accepts a user-defined function, applied here to the numeric columns
#> Pclass Sex Age SibSp Fare Embarked
#> <num> <char> <num> <num> <num> <char>
#> 1: 3 male 22.0 0.0 7.250 S
#> 2: 1 male 38.0 1.0 71.300 <NA>
#> 3: 3 female 26.0 3.0 7.920 S
#> 4: 1 female 35.0 1.0 32.025 Q
#> 5: 3 female 11.5 2.0 8.050 Q
#> 6: 2 female 45.0 3.0 8.460 S
#> 7: 2 <NA> 25.0 2.0 51.900 C
#> 8: 3 male 39.0 2.0 60.000 S
#> 9: 1 female 28.0 1.5 32.000 C
#> 10: 1 <NA> 40.0 0.0 15.000 S