10  必知必会第10课:分组数据

作者

黄天元

10.1 数据库的连接

library(pacman)
p_load(tidyverse,DBI,RSQLite)

mydb = dbConnect(RSQLite::SQLite(), "data/TYSQL.sqlite")

10.2 基于SQL的数据操作

# 10.1
dbGetQuery(mydb,"SELECT COUNT(*) AS num_prods
FROM Products
WHERE vend_id = 'DLL01';") 
  num_prods
1         4
# 10.2 
dbGetQuery(mydb,"SELECT vend_id, COUNT(*) AS num_prods
FROM Products
GROUP BY vend_id;")
  vend_id num_prods
1   BRS01         3
2   DLL01         4
3   FNG01         2
# 10.3
dbGetQuery(mydb,"SELECT cust_id, COUNT(*) AS orders
FROM Orders
GROUP BY cust_id
HAVING COUNT(*) >= 2;")
     cust_id orders
1 1000000001      2
dbGetQuery(mydb,"SELECT vend_id, COUNT(*) AS num_prods
FROM Products
WHERE prod_price >= 4
GROUP BY vend_id
HAVING COUNT(*) >= 2;")
  vend_id num_prods
1   BRS01         3
2   FNG01         2
dbGetQuery(mydb,"SELECT vend_id, COUNT(*) AS num_prods
FROM Products
GROUP BY vend_id
HAVING COUNT(*) >= 2;")
  vend_id num_prods
1   BRS01         3
2   DLL01         4
3   FNG01         2
# 10.4
dbGetQuery(mydb,"SELECT order_num, COUNT(*) AS items
FROM OrderItems
GROUP BY order_num
HAVING COUNT(*) >= 3;")
  order_num items
1     20006     3
2     20007     5
3     20008     5
4     20009     3
dbGetQuery(mydb,"SELECT order_num, COUNT(*) AS items
FROM OrderItems
GROUP BY order_num
HAVING COUNT(*) >= 3
ORDER BY items, order_num;")
  order_num items
1     20006     3
2     20009     3
3     20007     5
4     20008     5

10.3 基于tidyverse的数据操作

# 10.1 
tbl(mydb,"Products") %>% 
  filter(vend_id == 'DLL01') %>% 
  tally(name = "num_prods")
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  num_prods
      <int>
1         4
# 10.2
tbl(mydb,"Products") %>% 
  group_by(vend_id) %>% 
  tally(name = "num_prods")
# Source:   SQL [3 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  vend_id num_prods
  <chr>       <int>
1 BRS01           3
2 DLL01           4
3 FNG01           2
# 10.3
tbl(mydb,"Orders") %>% 
  summarise(orders = n(),.by = cust_id) %>% 
  filter(orders >= 2)
# Source:   SQL [1 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    orders
  <chr>       <int>
1 1000000001      2
tbl(mydb,"Products") %>% 
  filter(prod_price >= 4) %>%  # WHERE
  group_by(vend_id) %>% 
  summarise(num_prods = n()) %>% 
  filter(num_prods >= 2)  # HAVING
# Source:   SQL [2 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  vend_id num_prods
  <chr>       <int>
1 BRS01           3
2 FNG01           2
tbl(mydb,"Products") %>% 
  group_by(vend_id) %>% 
  summarise(num_prods = n()) %>% 
  filter(num_prods >= 2)  
# Source:   SQL [3 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  vend_id num_prods
  <chr>       <int>
1 BRS01           3
2 DLL01           4
3 FNG01           2
# 10.4
tbl(mydb,"OrderItems") %>% 
  group_by(order_num) %>% 
  tally(name = "items") %>% 
  filter(items >=3)
# Source:   SQL [4 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  order_num items
      <int> <int>
1     20006     3
2     20007     5
3     20008     5
4     20009     3
tbl(mydb,"OrderItems") %>% 
  group_by(order_num) %>% 
  tally(name = "items") %>% 
  filter(items >=3) %>% 
  arrange(items,order_num)
# Source:     SQL [4 x 2]
# Database:   sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
# Ordered by: items, order_num
  order_num items
      <int> <int>
1     20006     3
2     20009     3
3     20007     5
4     20008     5

10.4 练习

  • 请使用SQL和R两种方式,解决课后的挑战题。