library(pacman)
p_load(tidyverse,DBI,RSQLite)
= dbConnect(RSQLite::SQLite(), "data/TYSQL.sqlite") mydb
13 必知必会第13课:创建高级联结
13.1 数据库的连接
13.2 基于SQL和tidyverse的数据操作
# 13.1
dbGetQuery(mydb,"SELECT RTRIM(vend_name) || ' (' || RTRIM(vend_country) || ')'
AS vend_title
FROM Vendors
ORDER BY vend_name;")
vend_title
1 Bear Emporium (USA)
2 Bears R Us (USA)
3 Doll House Inc. (USA)
4 Fun and Games (England)
5 Furball Inc. (USA)
6 Jouets et ours (France)
tbl(mydb,"Vendors") %>%
arrange(vend_name) %>%
transmute(vend_title = str_c(
str_trim(vend_name,"right")," (",
str_trim(vend_country),")"
))
# Source: SQL [6 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
vend_title
<chr>
1 Bear Emporium (USA)
2 Bears R Us (USA)
3 Doll House Inc. (USA)
4 Fun and Games (England)
5 Furball Inc. (USA)
6 Jouets et ours (France)
dbGetQuery(mydb,"SELECT cust_name, cust_contact
FROM Customers AS C, Orders AS O, OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
cust_name cust_contact
1 Fun4All Denise L. Stephens
2 The Toy Store Kim Howard
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") -> O
tbl(mydb,"OrderItems") -> OI
%>% inner_join(O,by = "cust_id") %>%
C inner_join(OI,by = "order_num") %>%
filter(prod_id == 'RGAN01') %>%
select(cust_name,cust_contact)
# Source: SQL [2 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_name cust_contact
<chr> <chr>
1 Fun4All Denise L. Stephens
2 The Toy Store Kim Howard
# 13.2
dbGetQuery(mydb,"SELECT cust_id, cust_name, cust_contact
FROM Customers
WHERE cust_name = (SELECT cust_name
FROM Customers
WHERE cust_contact = 'Jim Jones');")
cust_id cust_name cust_contact
1 1000000003 Fun4All Jim Jones
2 1000000004 Fun4All Denise L. Stephens
tbl(mydb,"Customers") %>%
filter(cust_contact == 'Jim Jones') %>%
pull(cust_name) -> query1
tbl(mydb,"Customers") %>%
select(cust_id, cust_name, cust_contact) %>%
filter(cust_name == query1)
# Source: SQL [2 x 3]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id cust_name cust_contact
<chr> <chr> <chr>
1 1000000003 Fun4All Jim Jones
2 1000000004 Fun4All Denise L. Stephens
dbGetQuery(mydb,"SELECT c1.cust_id, c1.cust_name, c1.cust_contact
FROM Customers AS c1, Customers AS c2
WHERE c1.cust_name = c2.cust_name
AND c2.cust_contact = 'Jim Jones';")
cust_id cust_name cust_contact
1 1000000003 Fun4All Jim Jones
2 1000000004 Fun4All Denise L. Stephens
tbl(mydb,"Customers") %>%
select(cust_id,cust_name,cust_contact) -> c1
tbl(mydb,"Customers") %>%
filter(cust_contact == 'Jim Jones') %>%
select(cust_name) -> c2
%>% inner_join(c2) c1
Joining with `by = join_by(cust_name)`
# Source: SQL [2 x 3]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id cust_name cust_contact
<chr> <chr> <chr>
1 1000000003 Fun4All Jim Jones
2 1000000004 Fun4All Denise L. Stephens
# 13.2.2
dbGetQuery(mydb,"SELECT C.*, O.order_num, O.order_date,
OI.prod_id, OI.quantity, OI.item_price
FROM Customers AS C, Orders AS O,
OrderItems AS OI
WHERE C.cust_id = O.cust_id
AND OI.order_num = O.order_num
AND prod_id = 'RGAN01';")
cust_id cust_name cust_address cust_city cust_state cust_zip
1 1000000004 Fun4All 829 Riverside Drive Phoenix AZ 88888
2 1000000005 The Toy Store 4545 53rd Street Chicago IL 54545
cust_country cust_contact cust_email order_num order_date
1 USA Denise L. Stephens dstephens@fun4all.com 20007 2020-01-30
2 USA Kim Howard kim@thetoystore.com 20008 2020-02-03
prod_id quantity item_price
1 RGAN01 50 4.49
2 RGAN01 5 4.99
tbl(mydb,"Customers") -> C
tbl(mydb,"Orders") %>%
select(order_num,order_date,cust_id)-> O
tbl(mydb,"OrderItems") %>%
filter(prod_id == 'RGAN01') %>%
select(prod_id,quantity,item_price,order_num)-> OI
%>% inner_join(O) %>%
C inner_join(OI) %>%
collect() %>%
print(width = Inf)
Joining with `by = join_by(cust_id)`
Joining with `by = join_by(order_num)`
# A tibble: 2 × 14
cust_id cust_name cust_address cust_city cust_state cust_zip
<chr> <chr> <chr> <chr> <chr> <chr>
1 1000000004 Fun4All 829 Riverside Drive Phoenix AZ 88888
2 1000000005 The Toy Store 4545 53rd Street Chicago IL 54545
cust_country cust_contact cust_email order_num order_date
<chr> <chr> <chr> <int> <chr>
1 USA Denise L. Stephens dstephens@fun4all.com 20007 2020-01-30
2 USA Kim Howard kim@thetoystore.com 20008 2020-02-03
prod_id quantity item_price
<chr> <int> <dbl>
1 RGAN01 50 4.49
2 RGAN01 5 4.99
# 13.2.3
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
cust_id order_num
1 1000000001 20005
2 1000000003 20006
3 1000000004 20007
4 1000000005 20008
5 1000000001 20009
tbl(mydb,"Customers") %>%
inner_join(
tbl(mydb,"Orders"),by = "cust_id"
%>%
) select(cust_id,order_num)
# Source: SQL [5 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id order_num
<chr> <int>
1 1000000001 20005
2 1000000003 20006
3 1000000004 20007
4 1000000005 20008
5 1000000001 20009
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
cust_id order_num
1 1000000001 20005
2 1000000001 20009
3 1000000002 NA
4 1000000003 20006
5 1000000004 20007
6 1000000005 20008
7 1000000006 NA
tbl(mydb,"Customers") %>%
left_join(
tbl(mydb,"Orders"),by = "cust_id"
%>%
) select(cust_id,order_num)
# Source: SQL [7 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id order_num
<chr> <int>
1 1000000001 20005
2 1000000001 20009
3 1000000002 NA
4 1000000003 20006
5 1000000004 20007
6 1000000005 20008
7 1000000006 NA
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
RIGHT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
cust_id order_num
1 1000000001 20005
2 1000000001 20009
3 1000000003 20006
4 1000000004 20007
5 1000000005 20008
tbl(mydb,"Customers") %>%
right_join(
tbl(mydb,"Orders"),by = "cust_id"
%>%
) select(cust_id,order_num)
# Source: SQL [5 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id order_num
<chr> <int>
1 1000000001 20005
2 1000000001 20009
3 1000000003 20006
4 1000000004 20007
5 1000000005 20008
dbGetQuery(mydb,"SELECT Customers.cust_id, Orders.order_num
FROM Customers
FULL OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id;")
cust_id order_num
1 1000000001 20005
2 1000000001 20009
3 1000000002 NA
4 1000000003 20006
5 1000000004 20007
6 1000000005 20008
7 1000000006 NA
tbl(mydb,"Customers") %>%
full_join(
tbl(mydb,"Orders"),by = "cust_id"
%>%
) select(cust_id,order_num)
# Source: SQL [7 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id order_num
<chr> <int>
1 1000000001 20005
2 1000000001 20009
3 1000000002 NA
4 1000000003 20006
5 1000000004 20007
6 1000000005 20008
7 1000000006 NA
# 13.3
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
INNER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
cust_id num_ord
1 1000000001 2
2 1000000003 1
3 1000000004 1
4 1000000005 1
tbl(mydb,"Customers") %>%
select(cust_id) %>%
inner_join(tbl(mydb,"Orders"),by = "cust_id") %>%
count(cust_id,name = "num_ord")
# Source: SQL [4 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id num_ord
<chr> <int>
1 1000000001 2
2 1000000003 1
3 1000000004 1
4 1000000005 1
dbGetQuery(mydb,"SELECT Customers.cust_id,
COUNT(Orders.order_num) AS num_ord
FROM Customers
LEFT OUTER JOIN Orders ON Customers.cust_id = Orders.cust_id
GROUP BY Customers.cust_id;")
cust_id num_ord
1 1000000001 2
2 1000000002 0
3 1000000003 1
4 1000000004 1
5 1000000005 1
6 1000000006 0
tbl(mydb,"Customers") %>%
select(cust_id) %>%
left_join(tbl(mydb,"Orders"),by = "cust_id") %>%
summarise(num_ord = sum(!is.na(order_num)),.by = cust_id)
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Source: SQL [6 x 2]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
cust_id num_ord
<chr> <int>
1 1000000001 2
2 1000000002 0
3 1000000003 1
4 1000000004 1
5 1000000005 1
6 1000000006 0
13.3 注意
SQLite在3.39.0版本之前不支持右连接(RIGHT JOIN)和全外连接(FULL OUTER JOIN),但从3.39.0版本开始支持这两种连接类型。
13.4 练习
- 请使用SQL和R两种方式,解决课后的挑战题。