9  必知必会第9课:汇总数据

作者

黄天元

9.1 数据库的连接

library(pacman)
p_load(tidyverse,DBI,RSQLite)

mydb = dbConnect(RSQLite::SQLite(), "data/TYSQL.sqlite")

9.2 基于SQL的数据操作

# 9.1.1
dbGetQuery(mydb,"SELECT AVG(prod_price) AS avg_price
FROM Products;") 
  avg_price
1  6.823333
dbGetQuery(mydb,"SELECT AVG(prod_price) AS avg_price
FROM Products
WHERE vend_id = 'DLL01';")
  avg_price
1     3.865
# 9.1.2
dbGetQuery(mydb,"SELECT COUNT(*) AS num_cust
FROM Customers;")
  num_cust
1        6
dbGetQuery(mydb,"SELECT COUNT(cust_email) AS num_cust
FROM Customers;")
  num_cust
1        4
# 9.1.3
dbGetQuery(mydb,"SELECT MAX(prod_price) AS max_price
FROM Products;")
  max_price
1     11.99
# 9.1.4
dbGetQuery(mydb,"SELECT MIN(prod_price) AS min_price
FROM Products;")
  min_price
1      3.49
# 9.1.5
dbGetQuery(mydb,"SELECT SUM(quantity) AS items_ordered
FROM OrderItems
WHERE order_num = 20005;")
  items_ordered
1           200
dbGetQuery(mydb,"SELECT SUM(item_price*quantity) AS total_price
FROM OrderItems
WHERE order_num = 20005;")
  total_price
1        1648
# 9.2
dbGetQuery(mydb,"SELECT AVG(DISTINCT prod_price) AS avg_price
FROM Products
WHERE vend_id = 'DLL01';")
  avg_price
1      4.24
# 9.3
dbGetQuery(mydb,"SELECT COUNT(*) AS num_items,
MIN(prod_price) AS price_min,
MAX(prod_price) AS price_max,
AVG(prod_price) AS price_avg
FROM Products;")
  num_items price_min price_max price_avg
1         9      3.49     11.99  6.823333

9.3 基于tidyverse的数据操作

# 9.1.1
tbl(mydb,"Products") %>% 
  summarise(avg_price = mean(prod_price,na.rm = TRUE))
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  avg_price
      <dbl>
1      6.82
tbl(mydb,"Products") %>% 
  filter(vend_id == 'DLL01') %>% 
  summarise(avg_price = mean(prod_price,na.rm = TRUE))
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  avg_price
      <dbl>
1      3.86
# 9.1.2
tbl(mydb,"Customers") %>% 
  tally(name = "num_cust")
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  num_cust
     <int>
1        6
tbl(mydb,"Customers") %>% 
  filter(!is.na(cust_email)) %>% 
  tally(name = "num_cust")
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  num_cust
     <int>
1        4
# 9.1.3
tbl(mydb,"Products") %>% 
  summarise(max_price = max(prod_price))
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  max_price
      <dbl>
1      12.0
# 9.1.4
tbl(mydb,"Products") %>% 
  summarise(min_price = min(prod_price))
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  min_price
      <dbl>
1      3.49
# 9.1.5
tbl(mydb,"OrderItems") %>% 
  filter(order_num == 20005) %>% 
  summarise(items_ordered = sum(quantity))
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  items_ordered
          <int>
1           200
tbl(mydb,"OrderItems") %>% 
  filter(order_num == 20005) %>% 
  summarise(total_price = sum(item_price * quantity))
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  total_price
        <dbl>
1        1648
# 9.2
tbl(mydb,"Products") %>% 
  filter(vend_id == 'DLL01') %>% 
  summarise(avg_price = mean(distinct(prod_price))) 
# Source:   SQL [1 x 1]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  avg_price
      <dbl>
1      4.24
# 9.3
tbl(mydb,"Products") %>% 
  summarise(
    num_items = n(),
    price_min = min(prod_price),
    price_max = max(prod_price),
    price_avg = mean(prod_price)
  )
# Source:   SQL [1 x 4]
# Database: sqlite 3.46.0 [F:\Data\R4SQL\data\TYSQL.sqlite]
  num_items price_min price_max price_avg
      <int>     <dbl>     <dbl>     <dbl>
1         9      3.49      12.0      6.82

9.4 练习

  • 请使用SQL和R两种方式,解决课后的挑战题。