Quote from data.table:
query(data, j, by) # + extra arguments | | | -------> grouped by what? -------> what to do?
or,
query_if(data, i, j, by) # + extra arguments | | | | | -------> grouped by what? | -------> what to do? ---> on which rows?
If you don't need 'i
' argument, use 'query
'. In this case you can
avoid printing leading comma inside brackets to denote empty 'i
'.
query_if( data, i, j, by, keyby, with = TRUE, nomatch = getOption("datatable.nomatch"), mult = "all", roll = FALSE, rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE) else c(TRUE, FALSE), which = FALSE, .SDcols, verbose = getOption("datatable.verbose"), allow.cartesian = getOption("datatable.allow.cartesian"), drop = NULL, on = NULL ) query( data, j, by, keyby, with = TRUE, nomatch = getOption("datatable.nomatch"), mult = "all", roll = FALSE, rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE) else c(TRUE, FALSE), which = FALSE, .SDcols, verbose = getOption("datatable.verbose"), allow.cartesian = getOption("datatable.allow.cartesian"), drop = NULL, on = NULL )
data | data.table/data.frame data.frame will be automatically converted to data.table. |
---|---|
i | Integer, logical or character vector, single column numeric matrix, expression of column names, list, data.frame or data.table. integer and logical vectors work the same way they do in [.data.frame except logical NAs are treated as FALSE. expression is evaluated within the frame of the data.table (i.e. it sees column names as if they are variables) and can evaluate to any of the other types. For details see data.table |
j | When with=TRUE (default), j is evaluated within the frame of the data.table; i.e., it sees column names as if they are variables. This allows to not just select columns in j, but also compute on them e.g., x[, a] and x[, sum(a)] returns x$a and sum(x$a) as a vector respectively. x[, .(a, b)] and x[, .(sa=sum(a), sb=sum(b))] returns a two column data.table each, the first simply selecting columns a, b and the second computing their sums. For details see data.table. |
by | unquoted name of grouping variable of list of unquoted names of grouping variables. For details see data.table |
keyby | Same as by, but with an additional |
with | logical. For details see data.table. |
nomatch | Same as nomatch in match. For details see data.table. |
mult | For details see data.table. |
roll | For details see data.table. |
rollends | For details see data.table. |
which | For details see data.table. |
.SDcols | Specifies the columns of x to be included in the special symbol .SD which stands for Subset of data.table. May be character column names or numeric positions. For details see data.table. |
verbose | logical. For details see data.table. |
allow.cartesian | For details see data.table. |
drop | For details see data.table. |
on | For details see data.table. |
It depends. For details see data.table.
#> x y v #> 1: b 1 1 #> 2: b 3 2 #> 3: b 6 3 #> 4: a 1 4 #> 5: a 3 5 #> 6: a 6 6 #> 7: c 1 7 #> 8: c 3 8 #> 9: c 6 9# basic row subset operations query_if(dat, 2) # 2nd row#> x y v #> 1: b 3 2query_if(dat, 3:2) # 3rd and 2nd row#> x y v #> 1: b 6 3 #> 2: b 3 2#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6 #> 4: b 1 1 #> 5: b 3 2 #> 6: b 6 3 #> 7: c 1 7 #> 8: c 3 8 #> 9: c 6 9query_if(dat, y>2) # all rows where dat$y > 2#> x y v #> 1: b 3 2 #> 2: b 6 3 #> 3: a 3 5 #> 4: a 6 6 #> 5: c 3 8 #> 6: c 6 9query_if(dat, y>2 & v>5) # compound logical expressions#> x y v #> 1: a 6 6 #> 2: c 3 8 #> 3: c 6 9query_if(dat, !2:4) # all rows other than 2:4#> x y v #> 1: b 1 1 #> 2: a 3 5 #> 3: a 6 6 #> 4: c 1 7 #> 5: c 3 8 #> 6: c 6 9query_if(dat, -(2:4)) # same#> x y v #> 1: b 1 1 #> 2: a 3 5 #> 3: a 6 6 #> 4: c 1 7 #> 5: c 3 8 #> 6: c 6 9# select|compute columns data.table way query(dat, v) # v column (as vector)#> [1] 1 2 3 4 5 6 7 8 9#> v #> 1: 1 #> 2: 2 #> 3: 3 #> 4: 4 #> 5: 5 #> 6: 6 #> 7: 7 #> 8: 8 #> 9: 9#> [1] 45#> V1 #> 1: 45#> v V2 #> 1: 1 2 #> 2: 2 4 #> 3: 3 6 #> 4: 4 8 #> 5: 5 10 #> 6: 6 12 #> 7: 7 14 #> 8: 8 16 #> 9: 9 18# subset rows and select|compute data.table way query_if(dat, 2:3, sum(v)) # sum(v) over rows 2 and 3, return vector#> [1] 5#> V1 #> 1: 5#> sv #> 1: 5#> 2 3 4 5#> NULL# select columns the data.frame way query(dat, 2, with=FALSE) # 2nd column, returns a data.table always#> y #> 1: 1 #> 2: 3 #> 3: 6 #> 4: 1 #> 5: 3 #> 6: 6 #> 7: 1 #> 8: 3 #> 9: 6colNum = 2 query(dat, colNum, with=FALSE) # same, equivalent to DT[, .SD, .SDcols=colNum]#> y #> 1: 1 #> 2: 3 #> 3: 6 #> 4: 1 #> 5: 3 #> 6: 6 #> 7: 1 #> 8: 3 #> 9: 6# grouping operations - j and by query(dat, sum(v), by=x) # ad hoc by, order of groups preserved in result#> x V1 #> 1: b 6 #> 2: a 15 #> 3: c 24#> x V1 #> 1: a 15 #> 2: b 6 #> 3: c 24#> x V1 #> 1: a 15 #> 2: b 6 #> 3: c 24# fast ad hoc row subsets (subsets as joins) # same as x == "a" but uses binary search (fast) query_if(dat, "a", on="x")#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6query_if(dat, .("a"), on="x") # same#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6# same, single "==" internally optimised to use binary search (fast) query_if(dat, x=="a")#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6# not yet optimized, currently vector scan subset query_if(dat, x!="b" | y!=3)#> x y v #> 1: b 1 1 #> 2: b 6 3 #> 3: a 1 4 #> 4: a 3 5 #> 5: a 6 6 #> 6: c 1 7 #> 7: c 3 8 #> 8: c 6 9#> x y v #> 1: b 3 2#> x y v #> 1: b 3 2#> x y v #> 1: b 1 1 #> 2: b 2 NAquery_if(dat, .("b", 1:2), on=.(x, y), nomatch=0) # no match row is not returned#> x y v #> 1: b 1 1# locf, nomatch row gets rolled by previous row query_if(dat, .("b", 1:2), on=c("x", "y"), roll=Inf)#> x y v #> 1: b 1 1 #> 2: b 2 1query_if(dat, .("b", 1:2), on=.(x, y), roll=-Inf) # nocb, nomatch row gets rolled by next row#> x y v #> 1: b 1 1 #> 2: b 2 2#> [1] 25#> x V1 #> 1: b 6 #> 2: c 24#> x V1 #> 1: b 6 #> 2: c 24#> x V1 #> 1: b 6 #> 2: c 24#> x V1 #> 1: b 6 #> 2: c 24#> x v foo #> 1: c 8 4 #> 2: b 7 2query_if(dat, X, on="x") # right join#> x y v i.v foo #> 1: c 1 7 8 4 #> 2: c 3 8 8 4 #> 3: c 6 9 8 4 #> 4: b 1 1 7 2 #> 5: b 3 2 7 2 #> 6: b 6 3 7 2query_if(X, dat, on="x") # left join#> x v foo y i.v #> 1: b 7 2 1 1 #> 2: b 7 2 3 2 #> 3: b 7 2 6 3 #> 4: a NA NA 1 4 #> 5: a NA NA 3 5 #> 6: a NA NA 6 6 #> 7: c 8 4 1 7 #> 8: c 8 4 3 8 #> 9: c 8 4 6 9query_if(dat, X, on="x", nomatch=0) # inner join#> x y v i.v foo #> 1: c 1 7 8 4 #> 2: c 3 8 8 4 #> 3: c 6 9 8 4 #> 4: b 1 1 7 2 #> 5: b 3 2 7 2 #> 6: b 6 3 7 2query_if(dat, !X, on="x") # not join#> x y v #> 1: a 1 4 #> 2: a 3 5 #> 3: a 6 6#> x y v i.x foo #> 1: <NA> 8 NA c 4 #> 2: <NA> 7 NA b 2query_if(dat,X, on="y==v") # same as above (v1.9.8+)#> x y v i.x foo #> 1: <NA> 8 NA c 4 #> 2: <NA> 7 NA b 2query_if(dat, X, on = .(y<=foo)) # NEW non-equi join (v1.9.8+)#> x y v i.x i.v #> 1: b 4 1 c 8 #> 2: b 4 2 c 8 #> 3: a 4 4 c 8 #> 4: a 4 5 c 8 #> 5: c 4 7 c 8 #> 6: c 4 8 c 8 #> 7: b 2 1 b 7 #> 8: a 2 4 b 7 #> 9: c 2 7 b 7query_if(dat, X, on="y<=foo") # same as above#> x y v i.x i.v #> 1: b 4 1 c 8 #> 2: b 4 2 c 8 #> 3: a 4 4 c 8 #> 4: a 4 5 c 8 #> 5: c 4 7 c 8 #> 6: c 4 8 c 8 #> 7: b 2 1 b 7 #> 8: a 2 4 b 7 #> 9: c 2 7 b 7#> x y v i.x i.v #> 1: b 4 1 c 8 #> 2: b 4 2 c 8 #> 3: a 4 4 c 8 #> 4: a 4 5 c 8 #> 5: c 4 7 c 8 #> 6: c 4 8 c 8 #> 7: b 2 1 b 7 #> 8: a 2 4 b 7 #> 9: c 2 7 b 7query_if(dat, X, on=.(y>=foo)) # NEW non-equi join (v1.9.8+)#> x y v i.x i.v #> 1: b 4 3 c 8 #> 2: a 4 6 c 8 #> 3: c 4 9 c 8 #> 4: b 2 2 b 7 #> 5: b 2 3 b 7 #> 6: a 2 5 b 7 #> 7: a 2 6 b 7 #> 8: c 2 8 b 7 #> 9: c 2 9 b 7query_if(dat, X, on=.(x, y<=foo)) # NEW non-equi join (v1.9.8+)#> x y v i.v #> 1: c 4 7 8 #> 2: c 4 8 8 #> 3: b 2 1 7query_if(dat, X, .(x,y,x.y,v), on=.(x, y>=foo)) # Select x's join columns as well#> x y x.y v #> 1: c 4 6 9 #> 2: b 2 3 2 #> 3: b 2 6 3query_if(dat, X, on="x", mult="first") # first row of each group#> x y v i.v foo #> 1: c 1 7 8 4 #> 2: b 1 1 7 2query_if(dat, X, on="x", mult="last") # last row of each group#> x y v i.v foo #> 1: c 6 9 8 4 #> 2: b 6 3 7 2#> x V1 #> 1: c 24 #> 2: b 6#> x V1 #> 1: c 96 #> 2: b 12#> x V1 #> 1: c 192 #> 2: b 42#> x v V1 #> 1: c 8 36 #> 2: b 7 NA# more on special symbols, see also ?"special-symbols" query_if(dat, .N) # last row#> x y v #> 1: c 6 9query(dat, .N) # total number of rows in DT#> [1] 9query(dat, .N, by=x) # number of rows in each group#> x N #> 1: b 3 #> 2: a 3 #> 3: c 3query(dat, .SD, .SDcols=x:y) # select columns 'x' and 'y'#> x y #> 1: b 1 #> 2: b 3 #> 3: b 6 #> 4: a 1 #> 5: a 3 #> 6: a 6 #> 7: c 1 #> 8: c 3 #> 9: c 6query(dat, .SD[1]) # first row of all columns#> x y v #> 1: b 1 1query(dat, .SD[1], by=x) # first row of 'y' and 'v' for each group in 'x'#> x y v #> 1: b 1 1 #> 2: a 1 4 #> 3: c 1 7#> x N y v #> 1: b 3 10 6 #> 2: a 3 10 15 #> 3: c 3 10 24query(dat, .I[1], by=x) # row number in DT corresponding to each group#> x V1 #> 1: b 1 #> 2: a 4 #> 3: c 7#> x y v grp #> 1: b 1 1 1 #> 2: b 3 2 1 #> 3: b 6 3 1 #> 4: a 1 4 2 #> 5: a 3 5 2 #> 6: a 6 6 2query(X, query_if(dat, .BY, y, on="x"), by=x) # join within each group#> x V1 #> 1: c 1 #> 2: c 3 #> 3: c 6 #> 4: b 1 #> 5: b 3 #> 6: b 6# add/update/delete by reference (see ?assign) query(dat, z:=42L) %>% head() # add new column by reference#> x y v grp z #> 1: b 1 1 1 42 #> 2: b 3 2 1 42 #> 3: b 6 3 1 42 #> 4: a 1 4 2 42 #> 5: a 3 5 2 42 #> 6: a 6 6 2 42#> x y v grp #> 1: b 1 1 1 #> 2: b 3 2 1 #> 3: b 6 3 1 #> 4: a 1 4 2 #> 5: a 3 5 2 #> 6: a 6 6 2#> x y v grp #> 1: b 1 1 1 #> 2: b 3 2 1 #> 3: b 6 3 1 #> 4: a 1 42 2 #> 5: a 3 42 2 #> 6: a 6 42 2#> x y v grp v2 #> 1: b 1 1 1 84 #> 2: b 3 2 1 84 #> 3: b 6 3 1 84 #> 4: a 1 42 2 NA #> 5: a 3 42 2 NA #> 6: a 6 42 2 NA# NB: postfix [] is shortcut to print() query(dat, m:=mean(v), by=x)[] # add new column by reference by group#> x y v grp v2 m #> 1: b 1 1 1 84 2 #> 2: b 3 2 1 84 2 #> 3: b 6 3 1 84 2 #> 4: a 1 42 2 NA 42 #> 5: a 3 42 2 NA 42 #> 6: a 6 42 2 NA 42 #> 7: c 1 7 3 NA 8 #> 8: c 3 8 3 NA 8 #> 9: c 6 9 3 NA 8# advanced usage dat = data.table(x=rep(c("b","a","c"),each=3), v=c(1,1,1,2,2,1,1,2,2), y=c(1,3,6), a=1:9, b=9:1) dat#> x v y a b #> 1: b 1 1 1 9 #> 2: b 1 3 2 8 #> 3: b 1 6 3 7 #> 4: a 2 1 4 6 #> 5: a 2 3 5 5 #> 6: a 1 6 6 4 #> 7: c 1 1 7 3 #> 8: c 2 3 8 2 #> 9: c 2 6 9 1#> y V1 #> 1: 1 9 #> 2: 0 4#> bool V1 #> 1: 1 9 #> 2: 0 4query(dat, .SD[2], by=x) # get 2nd row of each group#> x v y a b #> 1: b 1 3 2 8 #> 2: a 2 3 5 5 #> 3: c 2 3 8 2#> x v y a b #> 1: b 1 3 2 8 #> 2: b 1 6 3 7 #> 3: a 2 3 5 5 #> 4: a 1 6 6 4 #> 5: c 2 3 8 2 #> 6: c 2 6 9 1#> x v y a b #> 1: b 3 10 6 24 #> 2: a 5 10 15 15 #> 3: c 5 10 24 6#> x v y a b #> 1: b 1 1 1 9 #> 2: a 1 6 6 4 #> 3: c 1 1 7 3#> x y MySum MyMin MyMax #> 1: b 1 2 1 1 #> 2: b 0 1 1 1 #> 3: a 1 4 2 2 #> 4: a 0 1 1 1 #> 5: c 1 3 1 2 #> 6: c 0 2 2 2query(dat, .(a = .(a), b = .(b)), by=x) # list columns#> x a b #> 1: b 1,2,3 9,8,7 #> 2: a 4,5,6 6,5,4 #> 3: c 7,8,9 3,2,1#> x seq #> 1: b 1 #> 2: b 2 #> 3: b 3 #> 4: b 4 #> 5: b 5 #> 6: b 6 #> 7: b 7 #> 8: b 8 #> 9: b 9 #> 10: a 4 #> 11: a 5 #> 12: a 6 #> 13: c 7 #> 14: c 6 #> 15: c 5 #> 16: c 4 #> 17: c 3#> x V1 #> 1: b 3 #> 2: a 5 #> 3: c 5#> x V1 #> 1: a 5 #> 2: c 5 #> 3: b 3#> x N v y a b #> 1: b 3 3 10 6 24 #> 2: a 3 5 10 15 15 #> 3: c 3 5 10 24 6# anonymous lambda in 'j', j accepts any valid # expression. TO REMEMBER: every element of # the list becomes a column in result. query(dat, {tmp = mean(y); .(a = a-tmp, b = b-tmp) }, by=x)#> x a b #> 1: b -2.3333333 5.6666667 #> 2: b -1.3333333 4.6666667 #> 3: b -0.3333333 3.6666667 #> 4: a 0.6666667 2.6666667 #> 5: a 1.6666667 1.6666667 #> 6: a 2.6666667 0.6666667 #> 7: c 3.6666667 -0.3333333 #> 8: c 4.6666667 -1.3333333 #> 9: c 5.6666667 -2.3333333if (FALSE) { pdf("new.pdf") query(dat, plot(a,b), by=x) # can also plot in 'j' dev.off() } # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v' query(dat, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b )#> rleid y v y a b #> 1: 1 6 1 1 1 7 #> 2: 2 3 2 1 4 5 #> 3: 3 6 1 1 6 3 #> 4: 4 6 2 3 8 1