Modify, aggregate, select or filter data.frame/data.table

let adds new variables or modify existing variables. let_if make the same thing conditionally. take aggregates data or select subset of the data by rows or columns. Both functions return data.table.

Add new variables: let(mtcars, new_var = 42, new_var2 = new_var*hp)
Filter data: take_if(mtcars, am==0)
Select variables: take(mtcars, am, vs, mpg)
Aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)
Aggregate all non-grouping columns: take(mtcars, fun = mean, by = am)

let_if(
  data,
  i,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

take_if(
  data,
  i,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL,
  autoname = TRUE,
  fun = NULL
)

take(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL,
  autoname = TRUE,
  fun = NULL
)

let(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

# S3 method for default
let(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

sort_by(data, ..., na.last = FALSE)

Arguments

data	data.table/data.frame data.frame will be automatically converted to data.table. `let` modify data.table object in-place.
i	integer/logical vector. Supposed to use to subset/conditional modifications of `data`. For details see data.table
...	List of variables or name-value pairs of summary/modifications functions. The name will be the name of the variable in the result. In the `let` and `take` functions we can use `a = b` or `a := b` notation. Advantages of `:=` is parametric assignment, e. g. `(a) := 2` create variable with name which are stored in `a`. In `let` `:=` can be used for multiassignment (`c("a", "b") := list(1,2)`)
by	unquoted name of grouping variable of list of unquoted names of grouping variables. For details see data.table
keyby	Same as `by`, but with an additional `setkey()` run on the by columns of the result, for convenience. It is common practice to use 'keyby=' routinely when you wish the result to be sorted. For details see data.table.
with	logical. For details see data.table.
nomatch	Same as nomatch in match. For details see data.table.
mult	For details see data.table.
roll	For details see data.table.
rollends	For details see data.table.
which	For details see data.table.
.SDcols	Specifies the columns of x to be included in the special symbol .SD which stands for Subset of data.table. May be character column names or numeric positions. For details see data.table.
verbose	logical. For details see data.table.
allow.cartesian	For details see data.table.
drop	For details see data.table.
on	For details see data.table.
autoname	logical. TRUE by default. Should we create names for unnamed expressions in `take`?
fun	function which will be applied to all variables in `take`. If there are no variables in `take` then it will be applied to all non-grouping variables in the `data`.
na.last	logical. FALSE by default. If TRUE, missing values in the data are put last; if FALSE, they are put first.

Value

data.table. let returns its result invisibly.

Examples

# examples form 'dplyr' package
data(mtcars)

# Newly created variables are available immediately
mtcars %>%
    let(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>% head()
#>     mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl2 cyl4
#> 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   12   24
#> 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   12   24
#> 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1    8   16
#> 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   12   24
#> 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   16   32
#> 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   12   24

# You can also use let() to remove variables and
# modify existing variables
mtcars %>%
    let(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>% head()
#>    cyl     disp  hp drat    wt  qsec vs am gear carb
#> 1:   6 2.621936 110 3.90 2.620 16.46  0  1    4    4
#> 2:   6 2.621936 110 3.90 2.875 17.02  0  1    4    4
#> 3:   4 1.769807  93 3.85 2.320 18.61  1  1    4    1
#> 4:   6 4.227872 110 3.08 3.215 19.44  1  0    3    1
#> 5:   8 5.899356 175 3.15 3.440 17.02  0  0    3    2
#> 6:   6 3.687098 105 2.76 3.460 20.22  1  0    3    1


# window functions are useful for grouped computations
mtcars %>%
    let(rank = rank(-mpg, ties.method = "min"),
        by = cyl) %>%
    head()
#>     mpg cyl disp  hp drat    wt  qsec vs am gear carb rank
#> 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4    2
#> 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4    2
#> 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1    8
#> 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1    1
#> 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2    2
#> 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1    6

# You can drop variables by setting them to NULL
mtcars %>% let(cyl = NULL) %>% head()
#>     mpg disp  hp drat    wt  qsec vs am gear carb
#> 1: 21.0  160 110 3.90 2.620 16.46  0  1    4    4
#> 2: 21.0  160 110 3.90 2.875 17.02  0  1    4    4
#> 3: 22.8  108  93 3.85 2.320 18.61  1  1    4    1
#> 4: 21.4  258 110 3.08 3.215 19.44  1  0    3    1
#> 5: 18.7  360 175 3.15 3.440 17.02  0  0    3    2
#> 6: 18.1  225 105 2.76 3.460 20.22  1  0    3    1

# keeps all existing variables
mtcars %>%
    let(displ_l = disp / 61.0237) %>%
    head()
#>     mpg cyl disp  hp drat    wt  qsec vs am gear carb  displ_l
#> 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 2.621932
#> 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 2.621932
#> 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 1.769804
#> 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 4.227866
#> 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 5.899347
#> 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 3.687092

# keeps only the variables you create
mtcars %>%
    take(displ_l = disp / 61.0237)
#>      displ_l
#>  1: 2.621932
#>  2: 2.621932
#>  3: 1.769804
#>  4: 4.227866
#>  5: 5.899347
#>  6: 3.687092
#>  7: 5.899347
#>  8: 2.403984
#>  9: 2.307300
#> 10: 2.746474
#> 11: 2.746474
#> 12: 4.519556
#> 13: 4.519556
#> 14: 4.519556
#> 15: 7.734700
#> 16: 7.538055
#> 17: 7.210313
#> 18: 1.289663
#> 19: 1.240502
#> 20: 1.165121
#> 21: 1.968088
#> 22: 5.211090
#> 23: 4.981671
#> 24: 5.735477
#> 25: 6.554830
#> 26: 1.294579
#> 27: 1.971365
#> 28: 1.558411
#> 29: 5.751864
#> 30: 2.376126
#> 31: 4.932510
#> 32: 1.982836
#>      displ_l


# can refer to both contextual variables and variable names:
var = 100
mtcars %>%
    let(cyl = cyl * var) %>%
    head()
#> Error in cyl * var: non-numeric argument to binary operator

# filter by condition
mtcars %>%
    take_if(am==0)
#>      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#>  1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
#>  2: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#>  3: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
#>  4: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#>  5: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#>  6: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
#>  7: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
#>  8: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
#>  9: 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
#> 10: 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
#> 11: 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
#> 12: 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> 13: 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> 14: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#> 15: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
#> 16: 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
#> 17: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
#> 18: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
#> 19: 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2

# filter by compound condition
mtcars %>%
    take_if(am==0 & mpg>mean(mpg))
#>     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> 1: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
#> 2: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#> 3: 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
#> 4: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1


# A 'take' with summary functions applied without 'by' argument returns an aggregated data
mtcars %>%
    take(mean = mean(disp), n = .N)
#>        mean  n
#> 1: 230.7219 32

# Usually, you'll want to group first
mtcars %>%
    take(mean = mean(disp), n = .N, by = cyl)
#>    cyl     mean  n
#> 1:   6 183.3143  7
#> 2:   4 105.1364 11
#> 3:   8 353.1000 14

# You can group by expressions:
mtcars %>%
    take(fun = mean, by = list(vsam = vs + am))
#>    vsam      mpg      cyl     disp        hp     drat       wt     qsec
#> 1:    1 20.28462 5.692308 189.4692 138.46154 3.738462 3.038846 18.04231
#> 2:    2 28.37143 4.000000  89.8000  80.57143 4.148571 2.028286 18.70000
#> 3:    0 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250
#>        gear     carb
#> 1: 4.076923 3.307692
#> 2: 4.142857 1.428571
#> 3: 3.000000 3.083333


# parametric evaluation:
var = quote(mean(cyl))
mtcars %>%
    let(mean_cyl = eval(var)) %>%
    head()
#> Error in `[.data.table`(data, , `:=`("mean_cyl", eval(var))): invalid type/length (closure/32) in vector allocation
take(mtcars, eval(var))
#>    eval(var)
#> 1:    6.1875

# all together
new_var = "mean_cyl"
mtcars %>%
    let((new_var) := eval(var)) %>%
    head()
#> Error in eval(lhs, parent.frame(), parent.frame()): object 'new_var' not found
take(mtcars, (new_var) := eval(var))
#>    mean_cyl
#> 1:   6.1875

########################################

# examples from data.table
dat = data.table(
    x=rep(c("b","a","c"), each=3),
    y=c(1,3,6),
    v=1:9
)

# basic row subset operations
take_if(dat, 2)                         # 2nd row
#>    x y v
#> 1: b 3 2
take_if(dat, 3:2)                       # 3rd and 2nd row
#>    x y v
#> 1: b 6 3
#> 2: b 3 2
take_if(dat, order(x))                  # no need for order(dat$x)
#>    x y v
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
#> 4: b 1 1
#> 5: b 3 2
#> 6: b 6 3
#> 7: c 1 7
#> 8: c 3 8
#> 9: c 6 9
take_if(dat, y>2)                       # all rows where dat$y > 2
#>    x y v
#> 1: b 3 2
#> 2: b 6 3
#> 3: a 3 5
#> 4: a 6 6
#> 5: c 3 8
#> 6: c 6 9
take_if(dat, y>2 & v>5)                 # compound logical expressions
#>    x y v
#> 1: a 6 6
#> 2: c 3 8
#> 3: c 6 9
take_if(dat, !2:4)                      # all rows other than 2:4
#>    x y v
#> 1: b 1 1
#> 2: a 3 5
#> 3: a 6 6
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9
take_if(dat, -(2:4))                    # same
#>    x y v
#> 1: b 1 1
#> 2: a 3 5
#> 3: a 6 6
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9

# select|compute columns
take(dat, v)                  # v column (as data.table)
#>    v
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
take(dat, sum(v))             # return data.table with sum of v (column autonamed 'sum(v)')
#>    sum(v)
#> 1:     45
take(dat, sv = sum(v))        # same, but column named "sv"
#>    sv
#> 1: 45
take(dat, v, v*2)             # return two column data.table, v and v*2
#>    v v * 2
#> 1: 1     2
#> 2: 2     4
#> 3: 3     6
#> 4: 4     8
#> 5: 5    10
#> 6: 6    12
#> 7: 7    14
#> 8: 8    16
#> 9: 9    18

# subset rows and select|compute
take_if(dat, 2:3, sum(v))      # sum(v) over rows 2 and 3
#>    sum(v)
#> 1:      5
take_if(dat, 2:3, sv = sum(v)) # same, but return data.table with column sv
#>    sv
#> 1:  5

# grouping operations
take(dat, sum(v), by = x)             # ad hoc by, order of groups preserved in result
#>    x sum(v)
#> 1: b      6
#> 2: a     15
#> 3: c     24
take(dat, sum(v), keyby = x)          # same, but order the result on by cols
#>    x sum(v)
#> 1: a     15
#> 2: b      6
#> 3: c     24


# all together now
take_if(dat, x!="a", sum(v), by=x)                       # get sum(v) by "x" for each x != "a"
#>    x sum(v)
#> 1: b      6
#> 2: c     24
take_if(dat, c("b", "c"), sum(v), by = .EACHI, on="x")   # same
#>    x sum(v)
#> 1: b      6
#> 2: c     24

# more on special symbols, see also ?"data.table::special-symbols"
take_if(dat, .N)                           # last row
#>    x y v
#> 1: c 6 9
take(dat, .N)                              # total number of rows in DT
#>    .N
#> 1:  9
take(dat, .N, by=x)                        # number of rows in each group
#>    x .N
#> 1: b  3
#> 2: a  3
#> 3: c  3

take(dat, .I[1], by=x)                     # row number in DT corresponding to each group
#>    x .I[1]
#> 1: b     1
#> 2: a     4
#> 3: c     7


# add/update/delete by reference
# [] at the end of expression is for autoprinting
let(dat, grp = .GRP, by=x)[]          # add a group counter column
#>    x y v grp
#> 1: b 1 1   1
#> 2: b 3 2   1
#> 3: b 6 3   1
#> 4: a 1 4   2
#> 5: a 3 5   2
#> 6: a 6 6   2
#> 7: c 1 7   3
#> 8: c 3 8   3
#> 9: c 6 9   3
let(dat, z = 42L)[]                   # add new column by reference
#>    x y v grp  z
#> 1: b 1 1   1 42
#> 2: b 3 2   1 42
#> 3: b 6 3   1 42
#> 4: a 1 4   2 42
#> 5: a 3 5   2 42
#> 6: a 6 6   2 42
#> 7: c 1 7   3 42
#> 8: c 3 8   3 42
#> 9: c 6 9   3 42
let(dat, z = NULL)[]                  # remove column by reference
#>    x y v grp
#> 1: b 1 1   1
#> 2: b 3 2   1
#> 3: b 6 3   1
#> 4: a 1 4   2
#> 5: a 3 5   2
#> 6: a 6 6   2
#> 7: c 1 7   3
#> 8: c 3 8   3
#> 9: c 6 9   3
let_if(dat, x=="a", v = 42L)[]        # subassign to existing v column by reference
#>    x y  v grp
#> 1: b 1  1   1
#> 2: b 3  2   1
#> 3: b 6  3   1
#> 4: a 1 42   2
#> 5: a 3 42   2
#> 6: a 6 42   2
#> 7: c 1  7   3
#> 8: c 3  8   3
#> 9: c 6  9   3
let_if(dat, x=="b", v2 = 84L)[]       # subassign to new column by reference (NA padded)
#>    x y  v grp v2
#> 1: b 1  1   1 84
#> 2: b 3  2   1 84
#> 3: b 6  3   1 84
#> 4: a 1 42   2 NA
#> 5: a 3 42   2 NA
#> 6: a 6 42   2 NA
#> 7: c 1  7   3 NA
#> 8: c 3  8   3 NA
#> 9: c 6  9   3 NA

let(dat, m = mean(v), by=x)[]         # add new column by reference by group
#>    x y  v grp v2  m
#> 1: b 1  1   1 84  2
#> 2: b 3  2   1 84  2
#> 3: b 6  3   1 84  2
#> 4: a 1 42   2 NA 42
#> 5: a 3 42   2 NA 42
#> 6: a 6 42   2 NA 42
#> 7: c 1  7   3 NA  8
#> 8: c 3  8   3 NA  8
#> 9: c 6  9   3 NA  8

# advanced usage
dat = data.table(x=rep(c("b","a","c"), each=3),
                 v=c(1,1,1,2,2,1,1,2,2),
                 y=c(1,3,6),
                 a=1:9,
                 b=9:1)

take(dat, sum(v), by=list(y%%2))              # expressions in by
#>    y sum(v)
#> 1: 1      9
#> 2: 0      4
take(dat, sum(v), by=list(bool = y%%2))       # same, using a named list to change by column name
#>    bool sum(v)
#> 1:    1      9
#> 2:    0      4
take(dat, fun = sum, by=x)                    # sum of all (other) columns for each group
#>    x v  y  a  b
#> 1: b 3 10  6 24
#> 2: a 5 10 15 15
#> 3: c 5 10 24  6
take(dat,
     MySum=sum(v),
     MyMin=min(v),
     MyMax=max(v),
     by = list(x, y%%2)               # by 2 expressions
)
#>    x y MySum MyMin MyMax
#> 1: b 1     2     1     1
#> 2: b 0     1     1     1
#> 3: a 1     4     2     2
#> 4: a 0     1     1     1
#> 5: c 1     3     1     2
#> 6: c 0     2     2     2

take(dat, seq = min(a):max(b), by=x)  # j is not limited to just aggregations
#>     x seq
#>  1: b   1
#>  2: b   2
#>  3: b   3
#>  4: b   4
#>  5: b   5
#>  6: b   6
#>  7: b   7
#>  8: b   8
#>  9: b   9
#> 10: a   4
#> 11: a   5
#> 12: a   6
#> 13: c   7
#> 14: c   6
#> 15: c   5
#> 16: c   4
#> 17: c   3
dat %>%
    take(V1 = sum(v), by=x) %>%
    take_if(V1<20)                    # compound query
#>    x V1
#> 1: b  3
#> 2: a  5
#> 3: c  5

dat %>%
    take(V1 = sum(v), by=x) %>%
    sort_by(-V1) %>%                  # ordering results
    head()
#>    x V1
#> 1: a  5
#> 2: c  5
#> 3: b  3

Modify, aggregate, select or filter data.frame/data.table

Arguments

Value

Examples

Contents