Impute missing values in the columns of a data.frame with each column's mean, median or mode.

impute_dt(.data, ..., .func = "mode")

Arguments

.data

A data.frame

...

Columns to select

.func

Character: "mode" (default), "mean" or "median". A user-defined function can also be supplied (see the `my_fun` example below).

Value

A data.table

Examples


# Small Titanic-style sample; every column contains at least one NA so that
# each imputation strategy below has something to fill.
Pclass <- c(3, 1, 3, 1, 3, 2, 2, 3, NA, NA)
Sex <- c(
  "male", "male", "female", "female", "female",
  "female", NA, "male", "female", NA
)
Age <- c(22, 38, 26, 35, NA, 45, 25, 39, 28, 40)
SibSp <- c(0, 1, 3, 1, 2, 3, 2, 2, NA, 0)
Fare <- c(7.25, 71.3, 7.92, NA, 8.05, 8.46, 51.9, 60, 32, 15)
Embarked <- c("S", NA, "S", "Q", "Q", "S", "C", "S", "C", "S")

# Column names are taken from the variable names passed in.
data <- data.frame(Pclass, Sex, Age, SibSp, Fare, Embarked)

data
#>    Pclass    Sex Age SibSp  Fare Embarked
#> 1       3   male  22     0  7.25        S
#> 2       1   male  38     1 71.30     <NA>
#> 3       3 female  26     3  7.92        S
#> 4       1 female  35     1    NA        Q
#> 5       3 female  NA     2  8.05        Q
#> 6       2 female  45     3  8.46        S
#> 7       2   <NA>  25     2 51.90        C
#> 8       3   male  39     2 60.00        S
#> 9      NA female  28    NA 32.00        C
#> 10     NA   <NA>  40     0 15.00        S
# No columns selected and `.func` left at its default ("mode"):
# the output below has no NA left in any column.
data %>%
  impute_dt()
#>     Pclass    Sex   Age SibSp  Fare Embarked
#>      <num> <char> <num> <num> <num>   <char>
#>  1:      3   male    22     0  7.25        S
#>  2:      1   male    38     1 71.30        S
#>  3:      3 female    26     3  7.92        S
#>  4:      1 female    35     1  7.25        Q
#>  5:      3 female    22     2  8.05        Q
#>  6:      2 female    45     3  8.46        S
#>  7:      2 female    25     2 51.90        C
#>  8:      3   male    39     2 60.00        S
#>  9:      3 female    28     2 32.00        C
#> 10:      3 female    40     0 15.00        S
# Mean-impute the numeric columns only; Sex and Embarked keep their NAs.
data %>%
  impute_dt(is.numeric, .func = "mean")
#>     Pclass    Sex      Age    SibSp     Fare Embarked
#>      <num> <char>    <num>    <num>    <num>   <char>
#>  1:   3.00   male 22.00000 0.000000  7.25000        S
#>  2:   1.00   male 38.00000 1.000000 71.30000     <NA>
#>  3:   3.00 female 26.00000 3.000000  7.92000        S
#>  4:   1.00 female 35.00000 1.000000 29.09778        Q
#>  5:   3.00 female 33.11111 2.000000  8.05000        Q
#>  6:   2.00 female 45.00000 3.000000  8.46000        S
#>  7:   2.00   <NA> 25.00000 2.000000 51.90000        C
#>  8:   3.00   male 39.00000 2.000000 60.00000        S
#>  9:   2.25 female 28.00000 1.555556 32.00000        C
#> 10:   2.25   <NA> 40.00000 0.000000 15.00000        S
# Median-impute the numeric columns only; Sex and Embarked keep their NAs.
data %>%
  impute_dt(is.numeric, .func = "median")
#>     Pclass    Sex   Age SibSp  Fare Embarked
#>      <num> <char> <num> <num> <num>   <char>
#>  1:    3.0   male    22     0  7.25        S
#>  2:    1.0   male    38     1 71.30     <NA>
#>  3:    3.0 female    26     3  7.92        S
#>  4:    1.0 female    35     1 15.00        Q
#>  5:    3.0 female    35     2  8.05        Q
#>  6:    2.0 female    45     3  8.46        S
#>  7:    2.0   <NA>    25     2 51.90        C
#>  8:    3.0   male    39     2 60.00        S
#>  9:    2.5 female    28     2 32.00        C
#> 10:    2.5   <NA>    40     0 15.00        S

# Custom imputer: replace every NA in `x` with half of the observed range,
# i.e. (max - min) / 2 computed over the non-missing values, and return the
# filled vector.
my_fun <- function(x) {
  half_range <- diff(range(x, na.rm = TRUE)) / 2
  replace(x, is.na(x), half_range)
}
# Pass the custom function itself as `.func`; only numeric columns are imputed.
data %>%
  impute_dt(is.numeric, .func = my_fun)
#>     Pclass    Sex   Age SibSp   Fare Embarked
#>      <num> <char> <num> <num>  <num>   <char>
#>  1:      3   male  22.0   0.0  7.250        S
#>  2:      1   male  38.0   1.0 71.300     <NA>
#>  3:      3 female  26.0   3.0  7.920        S
#>  4:      1 female  35.0   1.0 32.025        Q
#>  5:      3 female  11.5   2.0  8.050        Q
#>  6:      2 female  45.0   3.0  8.460        S
#>  7:      2   <NA>  25.0   2.0 51.900        C
#>  8:      3   male  39.0   2.0 60.000        S
#>  9:      1 female  28.0   1.5 32.000        C
#> 10:      1   <NA>  40.0   0.0 15.000        S