13  必知必会第13课:创建高级联结

作者

黄天元

13.1 数据库的连接

library(pacman)
p_load(tidyverse,DBI,RSQLite)

mydb = dbConnect(RSQLite::SQLite(), "data/TYSQL.sqlite")

13.2 基于SQL和tidyverse的数据操作

# 13.1
dbGetQuery(mydb,"SELECT RTRIM(vend_name) || ' (' || RTRIM(vend_country) || ')'
AS vend_title
FROM Vendors
ORDER BY vend_name;") 
               vend_title
1     Bear Emporium (USA)
2        Bears R Us (USA)
3   Doll House Inc. (USA)
4 Fun and Games (England)
5      Furball Inc. (USA)
6 Jouets et ours (France)
tbl(mydb,"Vendors") %>% 
  arrange(vend_name) %>% 
  transmute(vend_title = str_c(
    str_trim(vend_name,"right")," (",
    str_trim(vend_country),")"
  )) 
# Source:   SQL [6 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  vend_title             
  <chr>                  
1 Bear Emporium (USA)    
2 Bears R Us (USA)       
3 Doll House Inc. (USA)  
4 Fun and Games (England)
5 Furball Inc. (USA)     
6 Jouets et ours (France)
dbGetQuery(mydb,"SELECT cust_name, cust_contact
FROM Customers AS C, Orders AS O, OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
      cust_name       cust_contact
1       Fun4All Denise L. Stephens
2 The Toy Store         Kim Howard
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") -> O
tbl(mydb,"OrderItems") -> OI
C %>% inner_join(O,by = "cust_id") %>% 
  inner_join(OI,by = "order_num") %>% 
  filter(prod_id == 'RGAN01') %>% 
  select(cust_name,cust_contact)
# Source:   SQL [2 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_name     cust_contact      
  <chr>         <chr>             
1 Fun4All       Denise L. Stephens
2 The Toy Store Kim Howard        
# 13.2
dbGetQuery(mydb,"SELECT cust_id, cust_name, cust_contact
FROM Customers
WHERE cust_name = (SELECT cust_name
FROM Customers
WHERE cust_contact = 'Jim Jones');")
     cust_id cust_name       cust_contact
1 1000000003   Fun4All          Jim Jones
2 1000000004   Fun4All Denise L. Stephens
tbl(mydb,"Customers") %>% 
  filter(cust_contact == 'Jim Jones') %>% 
  pull(cust_name) -> query1
tbl(mydb,"Customers") %>% 
  select(cust_id, cust_name, cust_contact) %>% 
  filter(cust_name == query1)
# Source:   SQL [2 x 3]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    cust_name cust_contact      
  <chr>      <chr>     <chr>             
1 1000000003 Fun4All   Jim Jones         
2 1000000004 Fun4All   Denise L. Stephens
dbGetQuery(mydb,"SELECT c1.cust_id, c1.cust_name, c1.cust_contact
FROM Customers AS c1, Customers AS c2
WHERE c1.cust_name = c2.cust_name
AND c2.cust_contact = 'Jim Jones';")
     cust_id cust_name       cust_contact
1 1000000003   Fun4All          Jim Jones
2 1000000004   Fun4All Denise L. Stephens
tbl(mydb,"Customers") %>% 
  select(cust_id,cust_name,cust_contact) -> c1
tbl(mydb,"Customers") %>% 
  filter(cust_contact == 'Jim Jones') %>% 
  select(cust_name) -> c2
c1 %>% inner_join(c2)
Joining with `by = join_by(cust_name)`
# Source:   SQL [2 x 3]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    cust_name cust_contact      
  <chr>      <chr>     <chr>             
1 1000000003 Fun4All   Jim Jones         
2 1000000004 Fun4All   Denise L. Stephens
# 13.2.2
dbGetQuery(mydb,"SELECT C.*, O.order_num, O.order_date,
OI.prod_id, OI.quantity, OI.item_price
FROM Customers AS C, Orders AS O,
OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
     cust_id     cust_name        cust_address cust_city cust_state cust_zip
1 1000000004       Fun4All 829 Riverside Drive   Phoenix         AZ    88888
2 1000000005 The Toy Store    4545 53rd Street   Chicago         IL    54545
  cust_country       cust_contact            cust_email order_num order_date
1          USA Denise L. Stephens dstephens@fun4all.com     20007 2020-01-30
2          USA         Kim Howard   kim@thetoystore.com     20008 2020-02-03
  prod_id quantity item_price
1  RGAN01       50       4.49
2  RGAN01        5       4.99
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") %>% 
  select(order_num,order_date,cust_id)-> O
tbl(mydb,"OrderItems") %>% 
  filter(prod_id == 'RGAN01') %>% 
  select(prod_id,quantity,item_price,order_num)-> OI
C %>% inner_join(O) %>% 
  inner_join(OI) %>% 
  collect() %>% 
  print(width = Inf)
Joining with `by = join_by(cust_id)`
Joining with `by = join_by(order_num)`
# A tibble: 2 × 14
  cust_id    cust_name     cust_address        cust_city cust_state cust_zip
  <chr>      <chr>         <chr>               <chr>     <chr>      <chr>   
1 1000000004 Fun4All       829 Riverside Drive Phoenix   AZ         88888   
2 1000000005 The Toy Store 4545 53rd Street    Chicago   IL         54545   
  cust_country cust_contact       cust_email            order_num order_date
  <chr>        <chr>              <chr>                     <int> <chr>     
1 USA          Denise L. Stephens dstephens@fun4all.com     20007 2020-01-30
2 USA          Kim Howard         kim@thetoystore.com       20008 2020-02-03
  prod_id quantity item_price
  <chr>      <int>      <dbl>
1 RGAN01        50       4.49
2 RGAN01         5       4.99
# 13.2.3
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
     cust_id order_num
1 1000000001     20005
2 1000000003     20006
3 1000000004     20007
4 1000000005     20008
5 1000000001     20009
tbl(mydb,"Customers") %>% 
  inner_join(
    tbl(mydb,"Orders"),by = "cust_id"
  ) %>% 
  select(cust_id,order_num) 
# Source:   SQL [5 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    order_num
  <chr>          <int>
1 1000000001     20005
2 1000000003     20006
3 1000000004     20007
4 1000000005     20008
5 1000000001     20009
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
     cust_id order_num
1 1000000001     20005
2 1000000001     20009
3 1000000002        NA
4 1000000003     20006
5 1000000004     20007
6 1000000005     20008
7 1000000006        NA
tbl(mydb,"Customers") %>% 
  left_join(
    tbl(mydb,"Orders"),by = "cust_id"
  ) %>% 
  select(cust_id,order_num)
# Source:   SQL [7 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    order_num
  <chr>          <int>
1 1000000001     20005
2 1000000001     20009
3 1000000002        NA
4 1000000003     20006
5 1000000004     20007
6 1000000005     20008
7 1000000006        NA
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
RIGHT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
     cust_id order_num
1 1000000001     20005
2 1000000001     20009
3 1000000003     20006
4 1000000004     20007
5 1000000005     20008
tbl(mydb,"Customers") %>% 
  right_join(
    tbl(mydb,"Orders"),by = "cust_id"
  ) %>% 
  select(cust_id,order_num)
# Source:   SQL [5 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    order_num
  <chr>          <int>
1 1000000001     20005
2 1000000001     20009
3 1000000003     20006
4 1000000004     20007
5 1000000005     20008
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
FULL OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
     cust_id order_num
1 1000000001     20005
2 1000000001     20009
3 1000000002        NA
4 1000000003     20006
5 1000000004     20007
6 1000000005     20008
7 1000000006        NA
tbl(mydb,"Customers") %>% 
  full_join(
    tbl(mydb,"Orders"),by = "cust_id"
  ) %>% 
  select(cust_id,order_num)
# Source:   SQL [7 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    order_num
  <chr>          <int>
1 1000000001     20005
2 1000000001     20009
3 1000000002        NA
4 1000000003     20006
5 1000000004     20007
6 1000000005     20008
7 1000000006        NA
# 13.3
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
     cust_id num_ord
1 1000000001       2
2 1000000003       1
3 1000000004       1
4 1000000005       1
tbl(mydb,"Customers") %>% 
  select(cust_id) %>% 
  inner_join(tbl(mydb,"Orders"),by = "cust_id") %>% 
  count(cust_id,name = "num_ord")
# Source:   SQL [4 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    num_ord
  <chr>        <int>
1 1000000001       2
2 1000000003       1
3 1000000004       1
4 1000000005       1
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
     cust_id num_ord
1 1000000001       2
2 1000000002       0
3 1000000003       1
4 1000000004       1
5 1000000005       1
6 1000000006       0
tbl(mydb,"Customers") %>% 
  select(cust_id) %>% 
  left_join(tbl(mydb,"Orders"),by = "cust_id") %>% 
  summarise(num_ord = sum(!is.na(order_num)),.by = cust_id)
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Source:   SQL [6 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  cust_id    num_ord
  <chr>        <int>
1 1000000001       2
2 1000000002       0
3 1000000003       1
4 1000000004       1
5 1000000005       1
6 1000000006       0

13.3 注意

SQLite在3.39.0版本之前不支持右连接(RIGHT JOIN)和全外连接(FULL OUTER JOIN),但从3.39.0版本开始支持这两种连接类型。

13.4 练习

  • 请使用SQL和R两种方式,解决课后的挑战题。