Data manipulation within groups — group

Carry out data manipulation within specified groups.

group_dt(.data, by = NULL, ...)

rowwise_dt(.data, ...)

Arguments

.data: A data.frame
by: Variables to group by: either the unquoted name of a single grouping variable, or a list of unquoted names of grouping variables.
...: Any data manipulation arguments that could be implemented on a data.frame.

Value

data.table

Details

If you want to use summarise_dt or mutate_dt inside group_dt, it is better to use the "by" parameter of those functions instead. That is much faster, because you don't have to use .SD (which takes extra time to copy).

References

https://stackoverflow.com/questions/36802385/use-by-each-row-for-data-table

Examples

iris %>% group_dt(by = Species,slice_dt(1:2))
#>       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#>        <fctr>        <num>       <num>        <num>       <num>
#> 1:     setosa          5.1         3.5          1.4         0.2
#> 2:     setosa          4.9         3.0          1.4         0.2
#> 3: versicolor          7.0         3.2          4.7         1.4
#> 4: versicolor          6.4         3.2          4.5         1.5
#> 5:  virginica          6.3         3.3          6.0         2.5
#> 6:  virginica          5.8         2.7          5.1         1.9
iris %>% group_dt(Species,filter_dt(Sepal.Length == max(Sepal.Length)))
#>       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#>        <fctr>        <num>       <num>        <num>       <num>
#> 1:     setosa          5.8         4.0          1.2         0.2
#> 2: versicolor          7.0         3.2          4.7         1.4
#> 3:  virginica          7.9         3.8          6.4         2.0
iris %>% group_dt(Species,summarise_dt(new = max(Sepal.Length)))
#>       Species   new
#>        <fctr> <num>
#> 1:     setosa   5.8
#> 2: versicolor   7.0
#> 3:  virginica   7.9

# you can use pipes inside `group_dt`
iris %>% group_dt(Species,
                  mutate_dt(max= max(Sepal.Length)) %>%
                    summarise_dt(sum=sum(Sepal.Length)))
#>       Species   sum
#>        <fctr> <num>
#> 1:     setosa 250.3
#> 2: versicolor 296.8
#> 3:  virginica 329.4

# for users familiar with data.table, you can work on .SD directly
# the following code gets the first and last row of each group
iris %>%
  group_dt(
    by = Species,
    rbind(.SD[1],.SD[.N])
  )
#>       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#>        <fctr>        <num>       <num>        <num>       <num>
#> 1:     setosa          5.1         3.5          1.4         0.2
#> 2:     setosa          5.0         3.3          1.4         0.2
#> 3: versicolor          7.0         3.2          4.7         1.4
#> 4: versicolor          5.7         2.8          4.1         1.3
#> 5:  virginica          6.3         3.3          6.0         2.5
#> 6:  virginica          5.9         3.0          5.1         1.8

# for summarise_dt, you can use "by" to calculate within the group
mtcars %>%
  summarise_dt(
   disp = mean(disp),
   hp = mean(hp),
   by = cyl
)
#>      cyl     disp        hp
#>    <num>    <num>     <num>
#> 1:     6 183.3143 122.28571
#> 2:     4 105.1364  82.63636
#> 3:     8 353.1000 209.21429

  # but you could also, of course, use group_dt
 mtcars %>%
   group_dt(by =.(vs,am),
     summarise_dt(avg = mean(mpg)))
#>       vs    am      avg
#>    <num> <num>    <num>
#> 1:     0     1 19.75000
#> 2:     1     1 28.37143
#> 3:     1     0 20.74286
#> 4:     0     0 15.05000

  # a list of variables can also be used
 mtcars %>%
   group_dt(by =list(vs,am),
            summarise_dt(avg = mean(mpg)))
#>       vs    am      avg
#>    <num> <num>    <num>
#> 1:     0     1 19.75000
#> 2:     1     1 28.37143
#> 3:     1     0 20.74286
#> 4:     0     0 15.05000

# examples for `rowwise_dt`
df <- data.table(x = 1:2, y = 3:4, z = 4:5)

df %>% mutate_dt(m = mean(c(x, y, z)))
#>        x     y     z        m
#>    <int> <int> <int>    <num>
#> 1:     1     3     4 3.166667
#> 2:     2     4     5 3.166667

df %>% rowwise_dt(
  mutate_dt(m = mean(c(x, y, z)))
)
#>        x     y     z        m
#>    <int> <int> <int>    <num>
#> 1:     1     3     4 2.666667
#> 2:     2     4     5 3.666667