-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb_12.R
87 lines (63 loc) · 2.26 KB
/
db_12.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# attach relevant packages
library(tidyverse)
### Downsizing on the database #################################################
# Connection -------------------------------------------------------------------
con <- DBI::dbConnect(duckdb::duckdb())
dm::copy_dm_to(con, dm::dm_pixarfilms(), set_key_constraints = FALSE, temporary = FALSE)
# Lazy tables ------------------------------------------------------------------
pixar_films <- tbl(con, "pixar_films")
pixar_films
# Get all data ----
df_pixar_films <-
pixar_films |>
collect()
df_pixar_films
# Projection (column selection) -----------------------------------------------
pixar_films |>
select(1:3)
# Computations happens on the database!
pixar_films |>
select(1:3) |>
show_query()
# Bring the data into the R session
df_pixar_films_3 <-
pixar_films |>
select(1:3) |>
collect()
df_pixar_films_3
# Immutable data: original data unchanged
pixar_films |>
collect()
# Filtering (row selection) ---------------------------------------------------
pixar_films |>
filter(release_date >= "2020-01-01")
# Computations happens on the database!
pixar_films |>
filter(release_date >= "2020-01-01") |>
show_query()
# Bring the data into the R session
df_pixar_films_202x <-
pixar_films |>
filter(release_date >= "2020-01-01") |>
collect()
df_pixar_films_202x
# Immutable data: original data unchanged
pixar_films |>
collect()
# Downsizing on the database: Exercises ----------------------------------------
# `select()` -------------------------------------------------------------------
pixar_films
# * Find several ways to select the 3 first columns
# * What happens if you include the name of a variable multiple times in a `select()` call?
# * Select all columns that contain underscores (use `contains()`)
# * Use `all_of()` to select 2 columns of your choice
# `filter()` -------------------------------------------------------------------
pixar_films
# Find all films that
# 1. Are rated "PG"
# 2. Had a run time below 95
# 3. Had a rating of "N/A" or "Not Rated"
# 4. Were released after and including year 2020
# 5. Have a missing name (`film` column) or `run_time`
# 6. Are a first sequel (the name ends with "2", as in "Toy Story 2")
# - Hint: Bring the data into the R session before filtering