-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb_21.R
157 lines (108 loc) · 3.16 KB
/
db_21.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
library(DBI)
library(tidyverse)
requireNamespace("duckplyr")
### Working with database dumps #################################################
# Create data -------------------------------------------------------------------
# Dump the flights table to a single Parquet file, then compare the compressed
# on-disk size against the uncompressed in-memory size of the data frame.
arrow::write_parquet(nycflights13::flights, "flights.parquet")
fs::file_size("flights.parquet")
object.size(nycflights13::flights)
# Processing the local data ----
# Read as tibble ----
# Eager read: the entire Parquet file is loaded into memory as a tibble.
df <- arrow::read_parquet("flights.parquet")
df
# Read as Arrow dataset ----
# Lazy alternative: open_dataset() only scans metadata; no rows are read yet.
ds <- arrow::open_dataset("flights.parquet")
ds
# dplyr verbs on a dataset build up a lazy query; collect() materializes
# the result as a tibble.
ds |>
count(year, month, day) |>
collect()
# Register as duckdb lazy table ----
# In-memory DuckDB database: nothing is persisted to disk.
# NOTE: con_memory is reused by later sections of this script.
con_memory <- dbConnect(
duckdb::duckdb(),
dbdir = ":memory:"
)
# tbl_file() exposes the Parquet file as a lazy dbplyr table backed by DuckDB,
# without importing the data into the database.
tbl <- duckdb::tbl_file(con_memory, "flights.parquet")
tbl
class(tbl)
# Lazy query: printed results are computed on demand by DuckDB.
tbl |>
count(year, month, day)
# explain() prints the DuckDB query plan for the pipeline instead of running it.
tbl |>
count(year, month, day) |>
filter(month == 1) |>
explain()
# The future: Register as duckplyr lazy data frame ----
# duckplyr_df_from_parquet() returns an object that behaves like a regular
# data frame but defers computation to DuckDB.
system.time(
duckplyr_df <- duckplyr::duckplyr_df_from_parquet(
"flights.parquet"
)
)
# Timing nrow() separately shows that creation above was cheap: the scan
# only happens when the data is actually needed.
system.time(nrow(duckplyr_df))
class(duckplyr_df)
# Build a lazy pipeline; nothing is computed yet.
filtered <-
duckplyr_df |>
count(year, month, day) |>
filter(month == 1)
filtered |>
explain()
class(filtered)
# NOTE(review): extracting a column appears to force materialization —
# compare the explain() output before and after this access.
filtered[["year"]]
filtered
filtered |>
explain()
# Same pipeline with an integer literal (1L) instead of a double (1);
# compare the resulting plans.
duckplyr_df |>
count(year, month, day) |>
filter(month == 1L) |>
explain()
# Create partitioned data ------------------------------------------------------------------
# Write a hive-partitioned dataset: one directory level per year, then per
# month (flights-part/year=.../month=.../part-0.parquet).
arrow::write_dataset(
nycflights13::flights,
"flights-part/",
partitioning = c("year", "month")
)
fs::dir_tree("flights-part")
# Read partitioned data ------------------------------------------------------------------
# tbl_function() wraps a raw DuckDB table-function call as a lazy dbplyr table.
# hive_partitioning = true tells DuckDB to recover the year/month columns
# from the directory names.
tbl_part <- duckdb::tbl_function(
con_memory,
"read_parquet('flights-part/*/*/*.parquet', hive_partitioning = true)"
)
tbl_part
class(tbl_part)
tbl_part |>
count(year, month, day)
# Inspect the plan for a filter on a partitioning column.
tbl_part |>
filter(month %in% 1:3) |>
explain()
# Create CSV data ------------------------------------------------------------------------
readr::write_csv(nycflights13::flights, "flights.csv")
# Read CSV data --------------------------------------------------------------------------
# tbl_file() also handles CSV: DuckDB infers the format from the extension.
tbl_csv <- duckdb::tbl_file(con_memory, "flights.csv")
tbl_csv |>
count(year, month, day)
tbl_csv |>
count(year, month, day) |>
explain()
# Same workflow via duckplyr's lazy data frame interface.
duckplyr_df_csv <- duckplyr::duckplyr_df_from_csv("flights.csv")
duckplyr_df_csv |>
count(year, month, day)
duckplyr_df_csv |>
count(year, month, day) |>
explain()
# Create derived Parquet data with duckplyr ---------------------------------------------------------
# Pipe an aggregation straight into a Parquet file: df_to_parquet() runs the
# lazy query and writes the result without materializing it in R first.
duckplyr_df_csv |>
count(year, month, day) |>
duckplyr::df_to_parquet("flights-count.parquet")
fs::file_size("flights-count.parquet")
# Round-trip: read the derived file back as a lazy duckplyr data frame.
duckplyr_df_count <-
duckplyr::duckplyr_df_from_parquet("flights-count.parquet")
duckplyr_df_count |>
explain()
# NOTE(review): printing appears to materialize the frame — compare the two
# explain() outputs around this line.
duckplyr_df_count
duckplyr_df_count |>
explain()
# Exercises -------------------------------------------------------------------------
# 1. From the Parquet file, compute a lazy dbplyr table
# showing the mean and median departure delay
# for each month.
# 2. Compute the same data as duckplyr lazy data frames.
# 3. Store this data as a Parquet file.
# 4. Read the Parquet file and plot the data.