-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb_12_solutions.R
103 lines (61 loc) · 2.66 KB
/
db_12_solutions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# attach relevant packages
library(tidyverse)
# Connection -------------------------------------------------------------------
con <- DBI::dbConnect(duckdb::duckdb())
dm::copy_dm_to(con, dm::dm_pixarfilms(), set_key_constraints = FALSE, temporary = FALSE)
# Lazy tables ------------------------------------------------------------------
pixar_films <- tbl(con, "pixar_films")
# Downsizing on the database: Exercises ----------------------------------------
# `select()` -------------------------------------------------------------------
pixar_films
# * Find several ways to select the 3 first columns
select(pixar_films, 1:3)
select(pixar_films, number:release_date)
select(pixar_films, !4:ncol(pixar_films))
# * What happens if you include the name of a variable multiple times in a `select()` call?
select(pixar_films, number, release_date, number)
# * Select all columns that contain underscores (use `contains()`)
select(pixar_films, contains("_"))
# * Use `all_of()` to select 2 columns of your choice
cols_of_interest <- head(colnames(pixar_films), n = 2)
select(pixar_films, all_of(cols_of_interest))
select(pixar_films, !!cols_of_interest)
# `filter()` -------------------------------------------------------------------
pixar_films
# Find all films that
# 1. Are rated "PG"
filter(pixar_films, film_rating == "PG")
# 2. Had a run time below 95
filter(pixar_films, run_time < 95)
# 3. Had a rating of "N/A" or "Not Rated"
pixar_films |>
filter(film_rating %in% c("N/A", "Not Rated")) |>
show_query()
# 4. Were released after and including year 2020
filter(pixar_films, release_date >= as.Date("2020-01-01"))
# 5. Have a missing name (`film` column) or `run_time`
pixar_films |>
filter(is.na(film) | is.na(run_time))
pixar_films |>
filter(is.na(film) | is.na(run_time)) |>
show_query()
# 6. Are a first sequel (the name ends with "2", as in "Toy Story 2")
# - Hint: Bring the data into the R session before filtering
pixar_films |>
filter(grepl("2$", film)) |>
collect()
# `count()`, `summarize()`, `group_by()`, `ungroup()` --------------------------
pixar_films
# 1. How many films are stored in the table?
count(pixar_films)
# 2. How many films released after 2005 are stored in the table?
filter(pixar_films, release_date >= as.Date("2006-01-01")) |>
count()
# 3. What is the total run time of all films?
# - Hint: Use `summarize(sum(...))`, watch out for the warning
summarize(pixar_films, total_time = sum(run_time, na.rm = TRUE))
# 4. What is the total run time of all films, per rating?
# - Hint: Use `group_by()` or `.by`
pixar_films |>
summarize(.by = film_rating, total_time = sum(run_time, na.rm = TRUE))
# dbDisconnect(con)