diff --git a/_freeze/materials/0_housekeeping/execute-results/html.json b/_freeze/materials/0_housekeeping/execute-results/html.json index aa195bc..9211f02 100644 --- a/_freeze/materials/0_housekeeping/execute-results/html.json +++ b/_freeze/materials/0_housekeeping/execute-results/html.json @@ -1,9 +1,11 @@ { - "hash": "73e91f1bd3eb564aac096eb7f28e2119", + "hash": "98eca0fd351522cd2f53c0e060df7006", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Big Data in R with Arrow\"\nsubtitle: \"posit::conf(2024) 1-day workshop\"\nauthor: \"Nic Crane + Steph Hazlitt\"\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n\n\n# Welcome 👋\n\n## \n\n### WiFi ``{=html}\n\n- Username: **Posit Conf 2024**\n- Password: **conf2024**\n\n
\n\n### Workshop ``{=html}\n\n- Website: [pos.it/arrow-conf24](https://pos.it/arrow-conf24)\n- GitHub: [github.com/posit-conf-2024/arrow](https://github.com/posit-conf-2024/arrow)\n\n## Housekeeping\n\n
\n\n### Gender Neutral Bathrooms ``{=html}\n\n- Located on levels 3, 4, 5, 6 & 7\n\n### Specialty Rooms ``{=html}\n\n- Meditation/Prayer Room (503)\n- Lactation Room (509)\n\n*Available Mon & Tues 7am - 7pm, and Wed 7am - 5pm\n\n\n## Photos\n\n
\n\n### Red Lanyards ``{=html}``{=html} **NO** ``{=html}\n\n
\n\nPlease note everyone’s lanyard colors before taking a photo and respect their choices.\n\n## Code of Conduct\n\n
\n\n### ``{=html} [posit.co/code-of-conduct/](https://posit.co/code-of-conduct/)\n\n- Contact any posit::conf staff member, identifiable by their staff t-shirt, or visit the conference general information desk.\n- Send a message to conf\\@posit.com; event organizers will respond promptly.\n- Call +1-844-448-1212; this phone number will be monitored for the duration of the event.\n\n## Meet Your Teaching Team ``{=html}\n\n
\n\n### Co-Instructors\n\n- Nic Crane\n- Steph Hazlitt\n\n### Teaching Assistant\n\n- Jonathan Keane\n\n## Meet Each Other ``{=html}\n\n
\n\n- When did you use R for the first time?\n- What is your favorite R package?\n- Which package hex sticker would you like to find the most during posit::conf(2024)?\n\n## Getting Help Today ``{=html}\n\n
\n\n[TEAL]{style=\"color:teal;\"} sticky note: I am OK / I am done\n\n[PINK]{style=\"color:pink;\"} sticky note: I need support / I am working\n\n
\n\n``{=html} You can ask questions at any time during the workshop\n\n## Discord ``{=html}\n\n- [pos.it/conf-event-portal](http://pos.it/conf-event-portal) (login)\n- Click on \"Join Discord, the virtual networking platform!\"\n- Browse Channels -> `#workshop-arrow`\n\n## We Assume\n\n- You know ``{=html}\n- You are familiar with the [dplyr](https://dplyr.tidyverse.org/) package for data manipulation ``{=html}\n- You have data in your life that is too large to fit into memory or sluggish in memory\n- You want to learn how to engineer your data storage for more performant access and analysis\n", - "supporting": [], + "markdown": "---\ntitle: \"Big Data in R with Arrow\"\nsubtitle: \"posit::conf(2024) 1-day workshop\"\nauthor: \"Nic Crane + Steph Hazlitt\"\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n# Welcome 👋\n\n## \n\n### WiFi ``{=html}\n\n- Username: **Posit Conf 2024**\n- Password: **conf2024**\n\n
\n\n### Workshop ``{=html}\n\n- Website: [pos.it/arrow-conf24](https://pos.it/arrow-conf24)\n- GitHub: [github.com/posit-conf-2024/arrow](https://github.com/posit-conf-2024/arrow)\n\n## Housekeeping\n\n
\n\n### Gender-Neutral Bathrooms ``{=html}\n\n- Located on levels 3, 4, 5, 6 & 7\n\n### Specialty Rooms ``{=html}\n\n- Meditation/Prayer Room (503)\n- Lactation Room (509)\n\n*Available Mon & Tues 7am - 7pm, and Wed 7am - 5pm\n\n\n## Photos\n\n
\n\n### Red Lanyards ``{=html}``{=html} **NO** ``{=html}\n\n
\n\nPlease note everyone’s lanyard colors before taking a photo and respect their choices.\n\n## Code of Conduct\n\n
\n\n### ``{=html} [posit.co/code-of-conduct/](https://posit.co/code-of-conduct/)\n\n- Contact any posit::conf staff member, identifiable by their staff t-shirt, or visit the conference general information desk.\n- Send a message to conf\\@posit.com; event organizers will respond promptly.\n- Call +1-844-448-1212; this phone number will be monitored for the duration of the event.\n\n## Meet Your Teaching Team ``{=html}\n\n
\n\n### Co-Instructors\n\n- Nic Crane\n- Steph Hazlitt\n\n### Teaching Assistant\n\n- Jonathan Keane\n\n## Meet Each Other ``{=html}\n\n
\n\n- When did you use R for the first time?\n- What is your favorite R package?\n- Which package hex sticker would you like to find the most during posit::conf(2024)?\n\n## Getting Help Today ``{=html}\n\n
\n\n[TEAL]{style=\"color:teal;\"} sticky note: I am OK / I am done\n\n[PINK]{style=\"color:pink;\"} sticky note: I need support / I am working\n\n
\n\n``{=html} You can ask questions at any time during the workshop\n\n## Discord ``{=html}\n\n- [pos.it/conf-event-portal](http://pos.it/conf-event-portal) (login)\n- Click on \"Join Discord, the virtual networking platform!\"\n- Browse Channels -> `#workshop-arrow`\n\n## We Assume\n\n- You know ``{=html}\n- You are familiar with the [dplyr](https://dplyr.tidyverse.org/) package for data manipulation ``{=html}\n- You have data in your life that is too large to fit into memory or sluggish in memory\n- You want to learn how to engineer your data storage for more performant access and analysis\n\n## Setup\n\n- Log onto Workbench at the following URL: \n- Create a new session; **select \"Resource Profile: Large\"**\n- Run `usethis::use_course(\"posit-conf-2024/arrow\")`\n- Open `data/setup.R` and run the script\n", + "supporting": [ + "0_housekeeping_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/3_data_engineering-exercises/execute-results/html.json b/_freeze/materials/3_data_engineering-exercises/execute-results/html.json index 2b558ae..a4b8edc 100644 --- a/_freeze/materials/3_data_engineering-exercises/execute-results/html.json +++ b/_freeze/materials/3_data_engineering-exercises/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "949ad5f0c58f263cb46500cfd640fc1d", + "hash": "fa36122b964d2adb9ad5f21d7c58c8cc", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Data Engineering with Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\neditor: source \n---\n\n\n\n\n# Schemas\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\"\n)\n```\n:::\n\n\n\n\n::: {#exercise-schema .callout-tip}\n# Data Types & Controlling the Schema\n\n::: panel-tabset\n## Problems\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` (or the alias ``) instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n## Solution 1\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n skip = 1,\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #or utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\n```\n:::\n\n\n\n\nor\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n col_types = schema(ISBN = string()) #utf8()\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n## Solution 2\n\nThe number of `Checkouts` by `CheckoutYear` arranged by `CheckoutYear`:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\n\n\nor\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear n\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\n\n\nTiming the query:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 11.474 1.084 11.003 \n```\n\n\n:::\n:::\n\n\n\n\nQuerying 42 million rows of data stored in a CSV on disk in \\~10 seconds, not too bad.\n:::\n:::\n\n# Parquet\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n\n\n::: {#exercise-dataset .callout-tip}\n# Parquet\n\n::: panel-tabset\n## Problem\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. 
Did you notice a difference in compute time?\n\n## Solution 1\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nopen_dataset(sources = seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 2.076 0.287 0.646 \n```\n\n\n:::\n:::\n\n\n\n\nA *much* faster compute time for the query when the on-disk data is stored in the Parquet format.\n:::\n:::\n\n# Partitioning\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n\n\n::: callout-tip\n# Partitioning\n\n::: panel-tabset\n## Problems\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n## Solution 1\n\nWriting the data:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_checkouttype <- \"data/seattle-library-checkouts-type\"\n\nseattle_csv |>\n group_by(CheckoutType) |>\n write_dataset(path = seattle_checkouttype,\n format = \"parquet\")\n```\n:::\n\n\n\n\n## Solution 2\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutType`:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts-type\") |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.965 0.160 0.409 \n```\n\n\n:::\n:::\n\n\n\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutYear` and `CheckoutMonth`:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(\"data/seattle-library-checkouts\") |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.058 0.006 0.052 \n```\n\n\n:::\n:::\n\n\n\n\nFaster compute time because the `filter()` call is based on the partitions.\n:::\n:::\n", + "markdown": "---\ntitle: \"Data Engineering with Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\neditor: source \n---\n\n\n# Schemas\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\")\n```\n:::\n\n\n::: {#exercise-schema .callout-tip}\n# Data Types & Controlling the Schema\n\n::: panel-tabset\n## Problems\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` (or the alias ``) instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #or utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n ),\n skip = 1,\n)\n```\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n col_types = schema(ISBN = string()) # or utf8()\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n## Solution 2\n\nThe number of `Checkouts` by `CheckoutYear` arranged by `CheckoutYear`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear n\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\nTiming the query:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 10.651 1.091 10.333 \n```\n\n\n:::\n:::\n\n\nQuerying 42 million rows of data stored in a CSV on disk in \\~10 seconds, not too bad.\n:::\n:::\n\n# Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n::: {#exercise-dataset .callout-tip}\n# Parquet\n\n::: panel-tabset\n## Problem\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. 
Did you notice a difference in compute time?\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nopen_dataset(sources = seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.634 0.345 0.558 \n```\n\n\n:::\n:::\n\n\nA *much* faster compute time for the query when the on-disk data is stored in the Parquet format.\n:::\n:::\n\n# Partitioning\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n::: callout-tip\n# Partitioning\n\n::: panel-tabset\n## Problems\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n## Solution 1\n\nWriting the data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_checkouttype <- \"data/seattle-library-checkouts-type\"\n\nseattle_csv |>\n group_by(CheckoutType) |>\n write_dataset(path = seattle_checkouttype,\n format = \"parquet\")\n```\n:::\n\n\n## Solution 2\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutType`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts-type\") |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.777 0.072 0.296 \n```\n\n\n:::\n:::\n\n\nTotal number of Checkouts in September of 2019 using partitioned Parquet data by `CheckoutYear` and `CheckoutMonth`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(\"data/seattle-library-checkouts\") |> \n filter(CheckoutYear == 2019, CheckoutMonth == 9) |> \n summarise(TotalCheckouts = sum(Checkouts)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.034 0.005 0.030 \n```\n\n\n:::\n:::\n\n\nFaster compute time because the `filter()` call is based on the partitions.\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/materials/3_data_engineering/execute-results/html.json b/_freeze/materials/3_data_engineering/execute-results/html.json index c7ca5bd..74eb5f1 100644 --- a/_freeze/materials/3_data_engineering/execute-results/html.json +++ b/_freeze/materials/3_data_engineering/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "6a1fef61889b21e92adbcf1ca896a4f9", + "hash": "bccca98d2edd2bc29031ab127391891b", "result": { "engine": "knitr", - "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n::: {.cell}\n\n:::\n\n\n\n\n# Data Engineering with Arrow {#data-eng-storage}\n\n## Data Engineering\n\n
\n\n![](images/data-engineering.png)\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## .NORM Files\n\n![](images/norm_normal_file_format_2x.png){.absolute top=\"0\" left=\"400\"}\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## Poll: Formats\n\n
\n\n**Which file formats do you use most often?**\n\n
\n\n- 1️⃣ CSV (.csv)\n- 2️⃣ MS Excel (.xls and .xlsx)\n- 3️⃣ Parquet (.parquet)\n- 4️⃣ Something else\n\n\n## Arrow & File Formats\n\n![](images/arrow-read-write-updated.png)\n\n## Seattle
Checkouts
Big CSV\n\n![](images/seattle-checkouts.png){.absolute top=\"0\" left=\"300\"}\n\n::: {style=\"font-size: 60%; margin-top: 440px; margin-left: 330px;\"}\n\n:::\n\n## Dataset contents\n\n![](images/datapreview.png){height=\"550\"}\n\n## arrow::open_dataset() with a CSV\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\n\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\")\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n## arrow::schema()\n\n> Create a schema or extract one from an object.\n\n
\n\nLet's extract the schema:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(seattle_csv)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n## Arrow Data Types\n\nArrow has a rich data type system, including direct analogs of many R data types\n\n- `` == ``\n- `` == `` OR `` (aliases)\n- `` == ``\n\n
\n\n\n\n## Parsing the Metadata\n\n
\n\nArrow scans 👀 1MB of data to impute or \"guess\" the data types\n\n::: {style=\"font-size: 80%; margin-top: 200px;\"}\n📚 arrow vs readr blog post: \n:::\n\n## Parsers Are Not Always Right\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(seattle_csv)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n![](images/data-dict.png){.absolute top=\"300\" left=\"330\" width=\"700\"}\n\n::: notes\nInternational Standard Book Number (ISBN) is a 13-digit number that uniquely identifies books and book-like products published internationally.\n\nData Dictionaries, metadata in data catalogues should provide this info.\n\nThe number or rows used to infer the schema will vary depending on the data in each column, total number of columns, and how many bytes each value takes up in memory. \n\nIf all of the values in a column that lie within the first 1MB of the file are missing values, arrow will classify this data as null type. ISBN! Phone numbers, zip codes, leading zeros...\n\nRecommended specifying a schema when working with CSV datasets to avoid potential issues like this\n:::\n\n## Let's Control the Schema\n\nCreating a schema manually:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n)\n```\n:::\n\n\n\n\n
\n\nThis will take a lot of typing with 12 columns 😢\n\n## Let's Control the Schema\n\nUse the code() method to extract the code from the schema:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema$code() \n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nschema(UsageClass = utf8(), CheckoutType = utf8(), MaterialType = utf8(), \n CheckoutYear = int64(), CheckoutMonth = int64(), Checkouts = int64(), \n Title = utf8(), ISBN = null(), Creator = utf8(), Subjects = utf8(), \n Publisher = utf8(), PublicationYear = utf8())\n```\n\n\n:::\n:::\n\n\n\n\n
\n\n🤩\n\n## Let's Control the Schema\n\nSchema defines column names and types, so we need to skip the first row (skip = 1):\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|12\"}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n skip = 1,\n schema = schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n )\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n## Let's Control the Schema\n\nSupply column types for a subset of columns by providing a partial schema:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n col_types = schema(ISBN = string()) #utf8()\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n\n\n## Your Turn\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 9GB CSV file + arrow + dplyr\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\n\n\n## 9GB CSV file + arrow + dplyr\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"6\"}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 11.581 1.136 11.117 \n```\n\n\n:::\n:::\n\n\n\n\n42 million rows -- not bad, but could be faster....\n\n## File Format: Apache Parquet\n\n![](images/apache-parquet.png){.absolute top=\"100\" left=\"200\" width=\"700\"}\n\n::: {style=\"font-size: 60%; margin-top: 450px;\"}\n\n:::\n\n\n## Parquet Files: \"row-chunked\"\n\n![](images/parquet-chunking.png)\n\n## Parquet Files: \"row-chunked & column-oriented\"\n\n![](images/parquet-columnar.png)\n\n## Parquet\n\n- compression and encoding == usually much smaller than equivalent CSV file, less data to move from disk to memory\n- rich type system & stores the schema along with the data == more robust pipelines\n- \"row-chunked & column-oriented\" == work on different parts of the file at the same time or skip some chunks all together, better performance than row-by-row\n\n::: notes\n- efficient encodings to keep file size down, and supports file compression, less data to move from disk to memory\n- CSV has no info about data types, inferred by each parser\n:::\n\n\n## Writing to Parquet\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n\n\n## Storage: Parquet vs CSV\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile <- list.files(seattle_parquet)\nfile.size(file.path(seattle_parquet, file)) / 10**9\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 4.423348\n```\n\n\n:::\n:::\n\n\n\n\n
\n\nParquet about half the size of the CSV file on-disk 💾\n\n## Your Turn\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. Did you notice a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 4.5GB Parquet file + arrow + dplyr\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 2.018 0.265 0.595 \n```\n\n\n:::\n:::\n\n\n\n\n42 million rows -- much better! But could be *even* faster....\n\n## File Storage:
Partitioning\n\n
\n\n::: columns\n::: {.column width=\"50%\"}\nDividing data into smaller pieces, making it more easily accessible and manageable\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n::: notes\nalso called multi-files or sometimes shards\n:::\n\n## Poll: Partitioning?\n\nHave you partitioned your data or used partitioned data before today?\n\n
\n\n- 1️⃣ Yes\n- 2️⃣ No\n- 3️⃣ Not sure, the data engineers sort that out!\n\n## Art & Science of Partitioning\n\n
\n\n- avoid files \\< 20MB and \\> 2GB\n- avoid \\> 10,000 files (🤯)\n- partition on variables used in `filter()`\n\n::: notes\n- guidelines not rules, results vary\n- experiment, especially with cloud\n- arrow suggests avoid files smaller than 20MB and larger than 2GB\n- avoid partitions that produce more than 10,000 files\n- partition by variables that you filter by, allows arrow to only read relevant files\n:::\n\n## Rewriting the Data Again\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n\n\n## What Did We \"Engineer\"?\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nsizes <- tibble(\n files = list.files(seattle_parquet_part, recursive = TRUE),\n size_GB = file.size(file.path(seattle_parquet_part, files)) / 10**9\n)\n\nsizes\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n files size_GB\n \n 1 CheckoutYear=2005/part-0.parquet 0.114\n 2 CheckoutYear=2006/part-0.parquet 0.172\n 3 CheckoutYear=2007/part-0.parquet 0.186\n 4 CheckoutYear=2008/part-0.parquet 0.204\n 5 CheckoutYear=2009/part-0.parquet 0.224\n 6 CheckoutYear=2010/part-0.parquet 0.233\n 7 CheckoutYear=2011/part-0.parquet 0.250\n 8 CheckoutYear=2012/part-0.parquet 0.261\n 9 CheckoutYear=2013/part-0.parquet 0.282\n10 CheckoutYear=2014/part-0.parquet 0.296\n11 CheckoutYear=2015/part-0.parquet 0.308\n12 CheckoutYear=2016/part-0.parquet 0.315\n13 CheckoutYear=2017/part-0.parquet 0.319\n14 CheckoutYear=2018/part-0.parquet 0.306\n15 CheckoutYear=2019/part-0.parquet 0.302\n16 CheckoutYear=2020/part-0.parquet 0.158\n17 CheckoutYear=2021/part-0.parquet 0.240\n18 CheckoutYear=2022/part-0.parquet 0.252\n```\n\n\n:::\n:::\n\n\n\n\n## 4.5GB partitioned Parquet files + arrow + dplyr\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nopen_dataset(sources = seattle_parquet_part,\n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.640 0.220 0.267 \n```\n\n\n:::\n:::\n\n\n\n\n
\n\n42 million rows -- not too shabby!\n\n## Your Turn\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n\n## Partitions & NA Values\n\nADD content\n\n## Partition Design\n\n::: columns\n::: {.column width=\"50%\"}\n- Partitioning on variables commonly used in `filter()` often faster\n- Number of partitions also important (Arrow reads the metadata of each file)\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n## Performance Review: Single CSV\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts.csv\", \n format = \"csv\") |> \n\n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarise(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 13.362 1.763 12.438 \n```\n\n\n:::\n:::\n\n\n\n\n## Performance Review: Partitioned Parquet\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts\",\n format = \"parquet\") |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarise(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.330 0.039 0.091 \n```\n\n\n:::\n:::\n\n\n\n\n## Engineering Data Tips for Improved Storage & Performance\n\n
\n\n- consider \"column-oriented\" file formats like Parquet\n- consider partitioning, experiment to get an appropriate partition design 🗂️\n- watch your schemas 👀\n\n## R for Data Science (2e)\n\n::: columns\n::: {.column width=\"50%\"}\n![](images/r4ds-cover.jpg){.absolute top=\"100\" width=\"400\"}\n:::\n\n::: {.column width=\"50%\"}\n
\n\n[Chapter 23: Arrow](https://r4ds.hadley.nz/arrow.html)\n\n
\n\n\n:::\n:::\n", + "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n::: {.cell}\n\n:::\n\n\n# Data Engineering with Arrow {#data-eng-storage}\n\n## Data Engineering\n\n
\n\n![](images/data-engineering.png)\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## .NORM Files\n\n![](images/norm_normal_file_format_2x.png){.absolute top=\"0\" left=\"400\"}\n\n
\n\n::: {style=\"font-size: 70%;\"}\n\n:::\n\n## Poll: Formats\n\n
\n\n**Which file formats do you use most often?**\n\n
\n\n- 1️⃣ CSV (.csv)\n- 2️⃣ MS Excel (.xls and .xlsx)\n- 3️⃣ Parquet (.parquet)\n- 4️⃣ Something else\n\n## Arrow & File Formats\n\n![](images/arrow-read-write-updated.png)\n\n## Seattle
Checkouts
Big CSV\n\n![](images/seattle-checkouts.png){.absolute top=\"0\" left=\"300\"}\n\n::: {style=\"font-size: 60%; margin-top: 440px; margin-left: 330px;\"}\n\n:::\n\n## Dataset contents\n\n![](images/datapreview.png){height=\"550\"}\n\n## arrow::open_dataset() with a CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n\n\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\")\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n## arrow::schema()\n\n> Create a schema or extract one from an object.\n\n
\n\nLet's extract the schema:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(seattle_csv)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n## Arrow Data Types\n\nArrow has a rich data type system, including direct analogs of many R data types\n\n- `` == ``\n- `` == `` OR `` (aliases)\n- `` == ``\n\n
\n\n\n\n## Parsing the Metadata\n\n
\n\nArrow scans 👀 1MB of data to impute or \"guess\" the data types\n\n::: {style=\"font-size: 80%; margin-top: 200px;\"}\n📚 arrow vs readr blog post: \n:::\n\n## Parsers Are Not Always Right\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(seattle_csv)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: null\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n![](images/data-dict.png){.absolute top=\"300\" left=\"330\" width=\"700\"}\n\n::: notes\nInternational Standard Book Number (ISBN) is a 13-digit number that uniquely identifies books and book-like products published internationally.\n\nData Dictionaries, metadata in data catalogues should provide this info.\n\nThe number of rows used to infer the schema will vary depending on the data in each column, total number of columns, and how many bytes each value takes up in memory. \n\nIf all of the values in a column that lie within the first 1MB of the file are missing values, arrow will classify this data as null type. ISBN! Phone numbers, zip codes, leading zeros...\n\nRecommended specifying a schema when working with CSV datasets to avoid potential issues like this\n:::\n\n## Let's Control the Schema\n\nCreating a schema manually:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(\n  UsageClass = utf8(),\n  CheckoutType = utf8(),\n  MaterialType = utf8(),\n  CheckoutYear = int64(),\n  CheckoutMonth = int64(),\n  Checkouts = int64(),\n  Title = utf8(),\n  ISBN = string(), #utf8()\n  Creator = utf8(),\n  Subjects = utf8(),\n  Publisher = utf8(),\n  PublicationYear = utf8()\n)\n```\n:::\n\n\n
\n\nThis will take a lot of typing with 12 columns 😢\n\n## Let's Control the Schema\n\nUse the `code()` method to extract the code from the schema:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv$schema$code() \n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nschema(UsageClass = utf8(), CheckoutType = utf8(), MaterialType = utf8(), \n    CheckoutYear = int64(), CheckoutMonth = int64(), Checkouts = int64(), \n    Title = utf8(), ISBN = null(), Creator = utf8(), Subjects = utf8(), \n    Publisher = utf8(), PublicationYear = utf8())\n```\n\n\n:::\n:::\n\n\n
\n\n🤩\n\n## Let's Control the Schema\n\nSchema defines column names and types, so we need to skip the first row (skip = 1):\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|11|17\"}\nseattle_csv <- open_dataset(sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n schema = schema(\n UsageClass = utf8(),\n CheckoutType = utf8(),\n MaterialType = utf8(),\n CheckoutYear = int64(),\n CheckoutMonth = int64(),\n Checkouts = int64(),\n Title = utf8(),\n ISBN = string(), #utf8()\n Creator = utf8(),\n Subjects = utf8(),\n Publisher = utf8(),\n PublicationYear = utf8()\n ),\n skip = 1,\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n## Let's Control the Schema\n\nSupply column types for a subset of columns by providing a partial schema:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = \"data/seattle-library-checkouts.csv\",\n format = \"csv\",\n col_types = schema(ISBN = string()) #utf8()\n)\nseattle_csv\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nFileSystemDataset with 1 csv file\n12 columns\nUsageClass: string\nCheckoutType: string\nMaterialType: string\nCheckoutYear: int64\nCheckoutMonth: int64\nCheckouts: int64\nTitle: string\nISBN: string\nCreator: string\nSubjects: string\nPublisher: string\nPublicationYear: string\n```\n\n\n:::\n:::\n\n\n\n## Your Turn\n\n1. The first few thousand rows of `ISBN` are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with `open_dataset()` and ensure the correct data type for `ISBN` is `` (or the alias ``) instead of the `` interpreted by Arrow.\n\n2. 
Once you have a `Dataset` object with the metadata you are after, count the number of `Checkouts` by `CheckoutYear` and arrange the result by `CheckoutYear`.\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n CheckoutYear `sum(Checkouts)`\n \n 1 2005 3798685\n 2 2006 6599318\n 3 2007 7126627\n 4 2008 8438486\n 5 2009 9135167\n 6 2010 8608966\n 7 2011 8321732\n 8 2012 8163046\n 9 2013 9057096\n10 2014 9136081\n11 2015 9084179\n12 2016 9021051\n13 2017 9231648\n14 2018 9149176\n15 2019 9199083\n16 2020 6053717\n17 2021 7361031\n18 2022 7001989\n```\n\n\n:::\n:::\n\n\n## 9GB CSV file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"6\"}\nseattle_csv |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 10.688 1.099 10.451 \n```\n\n\n:::\n:::\n\n\n42 million rows -- not bad, but could be faster....\n\n## File Format: Apache Parquet\n\n![](images/apache-parquet.png){.absolute top=\"100\" left=\"200\" width=\"700\"}\n\n::: {style=\"font-size: 60%; margin-top: 450px;\"}\n\n:::\n\n## Parquet Files: \"row-chunked\"\n\n![](images/parquet-chunking.png)\n\n## Parquet Files: \"row-chunked & column-oriented\"\n\n![](images/parquet-columnar.png)\n\n## Parquet\n\n- compression and encoding == usually much smaller than equivalent CSV file, less data to move from disk to memory\n- rich type system & stores the schema along with the data == more robust pipelines\n- \"row-chunked & column-oriented\" == work on different parts of the file at the same time or skip some chunks all together, better performance than row-by-row\n\n::: notes\n- efficient encodings to keep file size down, and supports file compression, less data to move from disk to memory\n- CSV has no info about data types, inferred by each parser\n:::\n\n## Writing to Parquet\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet <- \"data/seattle-library-checkouts-parquet\"\n\nseattle_csv |>\n write_dataset(path = seattle_parquet,\n format = \"parquet\")\n```\n:::\n\n\n## Storage: Parquet vs CSV\n\n\n::: {.cell}\n\n```{.r .cell-code}\nfile <- list.files(seattle_parquet)\nfile.size(file.path(seattle_parquet, file)) / 10**9\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 4.424267\n```\n\n\n:::\n:::\n\n\n
\n\nParquet about half the size of the CSV file on-disk 💾\n\n## Your Turn\n\n1. Re-run the query counting the number of `Checkouts` by `CheckoutYear` and arranging the result by `CheckoutYear`, this time using the Seattle Checkout data saved to disk as a single, Parquet file. Did you notice a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## 4.5GB Parquet file + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = seattle_parquet, \n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.771 0.431 0.568 \n```\n\n\n:::\n:::\n\n\n42 million rows -- much better! But could be *even* faster....\n\n## File Storage:
Partitioning\n\n
\n\n::: columns\n::: {.column width=\"50%\"}\nDividing data into smaller pieces, making it more easily accessible and manageable\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n::: notes\nalso called multi-files or sometimes shards\n:::\n\n## Poll: Partitioning?\n\nHave you partitioned your data or used partitioned data before today?\n\n
\n\n- 1️⃣ Yes\n- 2️⃣ No\n- 3️⃣ Not sure, the data engineers sort that out!\n\n## Art & Science of Partitioning\n\n
\n\n- avoid files \\< 20MB and \\> 2GB\n- avoid \\> 10,000 files (🤯)\n- partition on variables used in `filter()`\n\n::: notes\n- guidelines not rules, results vary\n- experiment, especially with cloud\n- arrow suggests avoid files smaller than 20MB and larger than 2GB\n- avoid partitions that produce more than 10,000 files\n- partition by variables that you filter by, allows arrow to only read relevant files\n:::\n\n## Rewriting the Data Again\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = seattle_parquet_part,\n format = \"parquet\")\n```\n:::\n\n\n## What Did We \"Engineer\"?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nsizes <- tibble(\n files = list.files(seattle_parquet_part, recursive = TRUE),\n size_GB = file.size(file.path(seattle_parquet_part, files)) / 10**9\n)\n\nsizes\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 18 × 2\n files size_GB\n \n 1 CheckoutYear=2005/part-0.parquet 0.115\n 2 CheckoutYear=2006/part-0.parquet 0.172\n 3 CheckoutYear=2007/part-0.parquet 0.186\n 4 CheckoutYear=2008/part-0.parquet 0.204\n 5 CheckoutYear=2009/part-0.parquet 0.224\n 6 CheckoutYear=2010/part-0.parquet 0.233\n 7 CheckoutYear=2011/part-0.parquet 0.250\n 8 CheckoutYear=2012/part-0.parquet 0.261\n 9 CheckoutYear=2013/part-0.parquet 0.282\n10 CheckoutYear=2014/part-0.parquet 0.296\n11 CheckoutYear=2015/part-0.parquet 0.308\n12 CheckoutYear=2016/part-0.parquet 0.315\n13 CheckoutYear=2017/part-0.parquet 0.319\n14 CheckoutYear=2018/part-0.parquet 0.306\n15 CheckoutYear=2019/part-0.parquet 0.303\n16 CheckoutYear=2020/part-0.parquet 0.158\n17 CheckoutYear=2021/part-0.parquet 0.240\n18 CheckoutYear=2022/part-0.parquet 0.252\n```\n\n\n:::\n:::\n\n\n## 4.5GB partitioned Parquet files + arrow + dplyr\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_parquet_part <- \"data/seattle-library-checkouts\"\n\nopen_dataset(sources = seattle_parquet_part,\n format = \"parquet\") |>\n group_by(CheckoutYear) |>\n summarise(sum(Checkouts)) |>\n arrange(CheckoutYear) |> \n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.742 0.385 0.366 \n```\n\n\n:::\n:::\n\n\n
\n\n42 million rows -- not too shabby!\n\n## Your Turn\n\n1. Let's write the Seattle Checkout CSV data to a multi-file dataset just one more time! This time, write the data partitioned by `CheckoutType` as Parquet files.\n\n2. Now compare the compute time between our Parquet data partitioned by `CheckoutYear` and our Parquet data partitioned by `CheckoutType` with a query of the total number of checkouts in September of 2019. Did you find a difference in compute time?\n\n➡️ [Data Storage Engineering Exercises Page](3_data_engineering-exercises.html)\n\n## Partition Design\n\n::: columns\n::: {.column width=\"50%\"}\n- Partitioning on variables commonly used in `filter()` often faster\n- Number of partitions also important (Arrow reads the metadata of each file)\n:::\n\n::: {.column width=\"50%\"}\n![](images/partitions.png){.absolute top=\"0\"}\n:::\n:::\n\n## Partitions & NA Values\n\nDefault:\n\n\n::: {.cell}\n\n```{.r .cell-code}\npartition_na_default_path <- \"data/na-partition-default\"\n\nwrite_dataset(starwars,\n partition_na_default_path,\n partitioning = \"hair_color\")\n\nlist.files(partition_na_default_path)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"hair_color=__HIVE_DEFAULT_PARTITION__\"\n [2] \"hair_color=auburn\" \n [3] \"hair_color=auburn%2C%20grey\" \n [4] \"hair_color=auburn%2C%20white\" \n [5] \"hair_color=black\" \n [6] \"hair_color=blond\" \n [7] \"hair_color=blonde\" \n [8] \"hair_color=brown\" \n [9] \"hair_color=brown%2C%20grey\" \n[10] \"hair_color=grey\" \n[11] \"hair_color=none\" \n[12] \"hair_color=white\" \n```\n\n\n:::\n:::\n\n\n## Partitions & NA Values\n\nCustom:\n\n\n::: {.cell}\n\n```{.r .cell-code}\npartition_na_custom_path <- \"data/na-partition-custom\"\n\nwrite_dataset(starwars,\n partition_na_custom_path,\n partitioning = hive_partition(hair_color = string(),\n null_fallback = \"no_color\"))\n\nlist.files(partition_na_custom_path)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"hair_color=auburn\" \"hair_color=auburn%2C%20grey\" \n [3] \"hair_color=auburn%2C%20white\" \"hair_color=black\" \n [5] \"hair_color=blond\" \"hair_color=blonde\" \n [7] \"hair_color=brown\" \"hair_color=brown%2C%20grey\" \n [9] \"hair_color=grey\" \"hair_color=no_color\" \n[11] \"hair_color=none\" \"hair_color=white\" \n```\n\n\n:::\n:::\n\n\n## Performance Review: Single CSV\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts.csv\", \n format = \"csv\") |> \n\n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarise(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 11.718 1.106 11.250 \n```\n\n\n:::\n:::\n\n\n## Performance Review: Partitioned Parquet\n\nHow long does it take to calculate the number of books checked out in each month of 2021?\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nopen_dataset(sources = \"data/seattle-library-checkouts\",\n format = \"parquet\") |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarise(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.224 0.040 0.068 \n```\n\n\n:::\n:::\n\n\n## Engineering Data Tips for Improved Storage & Performance\n\n
\n\n- consider \"column-oriented\" file formats like Parquet\n- consider partitioning, experiment to get an appropriate partition design 🗂️\n- watch your schemas 👀\n\n## R for Data Science (2e)\n\n::: columns\n::: {.column width=\"50%\"}\n![](images/r4ds-cover.jpg){.absolute top=\"100\" width=\"400\"}\n:::\n\n::: {.column width=\"50%\"}\n
\n\n[Chapter 23: Arrow](https://r4ds.hadley.nz/arrow.html)\n\n
\n\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/materials/5_arrow_single_file/execute-results/html.json b/_freeze/materials/5_arrow_single_file/execute-results/html.json index e2238ec..1ad4ad9 100644 --- a/_freeze/materials/5_arrow_single_file/execute-results/html.json +++ b/_freeze/materials/5_arrow_single_file/execute-results/html.json @@ -1,9 +1,11 @@ { - "hash": "635c69d2b9051934482ef06f39f1cc44", + "hash": "8e987b3003a6c0a3bf5337526f5d6598", "result": { "engine": "knitr", - "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n\n\n# Arrow in R: In-Memory Workflows {#single-file-api}\n\n\n\n\n::: {.cell}\n\n:::\n\n\n\n\n## arrow 📦\n\n![](images/arrow-read-write-updated.png)\n\n## Arrow & Single Files\n\n
\n\n`library(arrow)`\n\n- `read_parquet()`\n- `read_csv_arrow()`\n- `read_feather()`\n- `read_json_arrow()`\n\n**Value**: `tibble` (the default), or an Arrow Table if `as_data_frame = FALSE` --- both *in-memory*\n\n## Your Turn\n\n1. Read in a single NYC Taxi parquet file using `read_parquet()` as an Arrow Table\n\n2. Convert your Arrow Table object to a `data.frame` or a `tibble`\n\n## Read a Parquet File (`tibble`)\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nparquet_file <- \"data/nyc-taxi/year=2019/month=9/part-0.parquet\"\n\ntaxi_df <- read_parquet(file = parquet_file)\ntaxi_df\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 6,567,396 × 22\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 VTS 2019-09-01 06:14:09 2019-09-01 06:31:52 2\n 2 VTS 2019-09-01 06:36:17 2019-09-01 07:12:44 1\n 3 VTS 2019-09-01 06:29:19 2019-09-01 06:54:13 1\n 4 CMT 2019-09-01 06:33:09 2019-09-01 06:52:14 2\n 5 VTS 2019-09-01 06:57:43 2019-09-01 07:26:21 1\n 6 CMT 2019-09-01 06:59:16 2019-09-01 07:28:12 1\n 7 CMT 2019-09-01 06:20:06 2019-09-01 06:52:19 1\n 8 CMT 2019-09-01 06:27:54 2019-09-01 06:32:56 0\n 9 CMT 2019-09-01 06:35:08 2019-09-01 06:55:51 0\n10 CMT 2019-09-01 06:19:37 2019-09-01 06:30:52 1\n# ℹ 6,567,386 more rows\n# ℹ 18 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n\n\n:::\n:::\n\n\n\n\n## Read a Parquet File (`Table`)\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxi_table <- read_parquet(file = parquet_file, as_data_frame = FALSE)\ntaxi_table\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nTable\n6567396 rows x 22 columns\n$vendor_name \n$pickup_datetime \n$dropoff_datetime \n$passenger_count \n$trip_distance \n$pickup_longitude \n$pickup_latitude \n$rate_code \n$store_and_fwd \n$dropoff_longitude \n$dropoff_latitude \n$payment_type \n$fare_amount \n$extra \n$mta_tax \n$tip_amount \n$tolls_amount \n$total_amount \n$improvement_surcharge \n$congestion_surcharge \n...\n2 more columns\nUse `schema()` to see entire schema\n```\n\n\n:::\n:::\n\n\n\n\n## `tibble` \\<-\\> `Table` \\<-\\> `data.frame`\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\n#change a df to a table\narrow_table(taxi_df)\n\n#change a table to a tibble\ntaxi_table |> collect()\nas_tibble(taxi_table)\n\n#change a table to a data.frame\nas.data.frame(taxi_table)\n```\n:::\n\n\n\n\n
\n\n- `data.frame` & `tibble` are R objects *in-memory*\n- `Table` is an Arrow object *in-memory*\n\n## Data frames\n\n![](images/tabular-structures-r.png)\n\n## Arrow Tables\n\n![](images/tabular-structures-arrow-1.png)\n\n::: notes\nArrow Tables are collections of chunked arrays\n:::\n\n## Table \\| Dataset: A `dplyr` pipeline\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 3 × 4\n vendor_name all_trips shared_trips pct_shared\n \n1 VTS 4238808 1339478 31.6\n2 CMT 2294473 470344 20.5\n3 34115 0 0 \n```\n\n\n:::\n:::\n\n\n\n\n
\n\nFunctions available in Arrow dplyr queries: \n\n::: notes\nAll the same capabilities as you practiced with Arrow Dataset\n:::\n\n## Arrow for Efficient In-Memory Processing\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 6567396\n```\n\n\n:::\n:::\n\n\n\n\n
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet() |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 2.214 0.575 0.814 \n```\n\n\n:::\n:::\n\n\n\n\n## Arrow for Efficient In-Memory Processing\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 6567396\n```\n\n\n:::\n:::\n\n\n\n\n
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.995 0.343 0.366 \n```\n\n\n:::\n:::\n\n\n\n\n## Read a Parquet File Selectively\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nTable\n6567396 rows x 2 columns\n$vendor_name \n$passenger_count \n```\n\n\n:::\n:::\n\n\n\n\n## Selective Reads Are Faster\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,3,4,11\"}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n ) |> \n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.323 0.088 0.234 \n```\n\n\n:::\n:::\n\n\n\n\n\n:::notes\nNotes: row-based format readers often allow you to specify which columns to read in but the entire row must be read in and the unwanted columns discarded. Parquet’s columnar format allows you to read in only the columns you need, which is faster when you only need a subset of the data.\n:::\n\n## Arrow Table or Dataset?\n\n![](images/2022-09-decision-map.png){.absolute left=\"200\" height=\"550\"}\n\n::: {style=\"font-size: 60%; margin-top: 575px; margin-left: 250px;\"}\n\n:::\n\n## Arrow for Improving Those Sluggish Worklows\n\n- a \"drop-in\" for many dplyr workflows (Arrow Table or Dataset)\n- works when your tabular data get too big for your RAM (Arrow Dataset)\n- provides tools for re-engineering data storage for better performance (`arrow::write_dataset()`)\n\n::: notes\nLot's of ways to speed up sluggish workflows e.g. [writing more performant tidyverse code](https://www.tidyverse.org/blog/2023/04/performant-packages/), use other data frame libraries like data.table or polars, use duckDB or other databases, Spark + splarklyr ... However, Arrow offers some attractive features for tackling this challenge, especially for dplyr users.\n:::\n", - "supporting": [], + "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n# Arrow in R: In-Memory Workflows {#single-file-api}\n\n\n::: {.cell}\n\n:::\n\n\n## arrow 📦\n\n![](images/arrow-read-write-updated.png)\n\n## Arrow & Single Files\n\n
\n\n`library(arrow)`\n\n- `read_parquet()`\n- `read_csv_arrow()`\n- `read_feather()`\n- `read_json_arrow()`\n\n**Value**: `tibble` (the default), or an Arrow Table if `as_data_frame = FALSE` --- both *in-memory*\n\n## Your Turn\n\n1. Read in a single NYC Taxi parquet file using `read_parquet()` as an Arrow Table\n\n2. Convert your Arrow Table object to a `data.frame` or a `tibble`\n\n## Read a Parquet File (`tibble`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\n\nparquet_file <- \"data/nyc-taxi/year=2019/month=9/part-0.parquet\"\n\ntaxi_df <- read_parquet(file = parquet_file)\ntaxi_df\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 6,567,396 × 22\n vendor_name pickup_datetime dropoff_datetime passenger_count\n \n 1 CMT 2019-08-31 18:09:30 2019-08-31 18:15:42 1\n 2 CMT 2019-08-31 18:26:30 2019-08-31 18:44:31 1\n 3 CMT 2019-08-31 18:39:35 2019-08-31 19:15:55 2\n 4 VTS 2019-08-31 18:12:26 2019-08-31 18:15:17 4\n 5 VTS 2019-08-31 18:43:16 2019-08-31 18:53:50 1\n 6 VTS 2019-08-31 18:26:13 2019-08-31 18:45:35 1\n 7 CMT 2019-08-31 18:34:52 2019-08-31 18:42:03 1\n 8 CMT 2019-08-31 18:50:02 2019-08-31 18:58:16 1\n 9 CMT 2019-08-31 18:08:02 2019-08-31 18:14:44 0\n10 VTS 2019-08-31 18:11:38 2019-08-31 18:26:47 1\n# ℹ 6,567,386 more rows\n# ℹ 18 more variables: trip_distance , pickup_longitude ,\n# pickup_latitude , rate_code , store_and_fwd ,\n# dropoff_longitude , dropoff_latitude , payment_type ,\n# fare_amount , extra , mta_tax , tip_amount ,\n# tolls_amount , total_amount , improvement_surcharge ,\n# congestion_surcharge , pickup_location_id , …\n```\n\n\n:::\n:::\n\n\n## Read a Parquet File (`Table`)\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntaxi_table <- read_parquet(file = parquet_file, as_data_frame = FALSE)\ntaxi_table\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nTable\n6567396 rows x 22 columns\n$vendor_name \n$pickup_datetime \n$dropoff_datetime \n$passenger_count \n$trip_distance \n$pickup_longitude \n$pickup_latitude \n$rate_code \n$store_and_fwd \n$dropoff_longitude \n$dropoff_latitude \n$payment_type \n$fare_amount \n$extra \n$mta_tax \n$tip_amount \n$tolls_amount \n$total_amount \n$improvement_surcharge \n$congestion_surcharge \n...\n2 more columns\nUse `schema()` to see entire schema\n```\n\n\n:::\n:::\n\n\n## `tibble` \\<-\\> `Table` \\<-\\> `data.frame`\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\n#change a df to a table\narrow_table(taxi_df)\n\n#change a table to a tibble\ntaxi_table |> collect()\nas_tibble(taxi_table)\n\n#change a table to a data.frame\nas.data.frame(taxi_table)\n```\n:::\n\n\n
\n\n- `data.frame` & `tibble` are R objects *in-memory*\n- `Table` is an Arrow object *in-memory*\n\n## Watch Your Schemas 👀\n\n\n:::: {.columns}\n\n::: {.column width=\"50%\"}\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(taxi_df)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nvendor_name: string\npickup_datetime: timestamp[us, tz=America/Vancouver]\ndropoff_datetime: timestamp[us, tz=America/Vancouver]\npassenger_count: int32\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int32\ndropoff_location_id: int32\n```\n\n\n:::\n:::\n\n:::\n\n::: {.column width=\"50%\"}\n\n::: {.cell}\n\n```{.r .cell-code}\nschema(taxi_table)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nSchema\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\n```\n\n\n:::\n:::\n\n:::\n\n::::\n\n## Data frames\n\n![](images/tabular-structures-r.png)\n\n## Arrow Tables\n\n![](images/tabular-structures-arrow-1.png)\n\n::: notes\nArrow Tables are collections of chunked arrays\n:::\n\n## Table \\| Dataset: A `dplyr` pipeline\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 3 × 4\n vendor_name all_trips shared_trips pct_shared\n \n1 VTS 4238808 1339478 31.6\n2 CMT 2294473 470344 20.5\n3 34115 0 0 \n```\n\n\n:::\n:::\n\n\n
\n\nFunctions available in Arrow dplyr queries: \n\n::: notes\nAll the same capabilities as you practiced with Arrow Dataset\n:::\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 6567396\n```\n\n\n:::\n:::\n\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet() |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.157 0.261 0.509 \n```\n\n\n:::\n:::\n\n\n## Arrow for Efficient In-Memory Processing\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 6567396\n```\n\n\n:::\n:::\n\n\n
\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,8\"}\nparquet_file |>\n read_parquet(as_data_frame = FALSE) |>\n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 1.047 0.203 0.220 \n```\n\n\n:::\n:::\n\n\n## Read a Parquet File Selectively\n\n\n::: {.cell}\n\n```{.r .cell-code}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n )\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nTable\n6567396 rows x 2 columns\n$vendor_name \n$passenger_count \n```\n\n\n:::\n:::\n\n\n## Selective Reads Are Faster\n\n\n::: {.cell}\n\n```{.r .cell-code code-line-numbers=\"|2,3,4,11\"}\nparquet_file |>\n read_parquet(\n col_select = c(\"vendor_name\", \"passenger_count\"),\n as_data_frame = FALSE\n ) |> \n group_by(vendor_name) |>\n summarise(all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect() |>\n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n user system elapsed \n 0.258 0.011 0.131 \n```\n\n\n:::\n:::\n\n\n\n:::notes\nNotes: row-based format readers often allow you to specify which columns to read in but the entire row must be read in and the unwanted columns discarded. Parquet’s columnar format allows you to read in only the columns you need, which is faster when you only need a subset of the data.\n:::\n\n## Arrow Table or Dataset?\n\n![](images/2022-09-decision-map.png){.absolute left=\"200\" height=\"550\"}\n\n::: {style=\"font-size: 60%; margin-top: 575px; margin-left: 250px;\"}\n\n:::\n\n## Arrow for Improving Those Sluggish Worklows\n\n- a \"drop-in\" for many dplyr workflows (Arrow Table or Dataset)\n- works when your tabular data get too big for your RAM (Arrow Dataset)\n- provides tools for re-engineering data storage for better performance (`arrow::write_dataset()`)\n\n::: notes\nLot's of ways to speed up sluggish workflows e.g. [writing more performant tidyverse code](https://www.tidyverse.org/blog/2023/04/performant-packages/), use other data frame libraries like data.table or polars, use duckDB or other databases, Spark + splarklyr ... 
However, Arrow offers some attractive features for tackling this challenge, especially for dplyr users.\n:::\n", + "supporting": [ + "5_arrow_single_file_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/materials/6_wrapping_up/execute-results/html.json b/_freeze/materials/6_wrapping_up/execute-results/html.json index 98dcfb7..b202311 100644 --- a/_freeze/materials/6_wrapping_up/execute-results/html.json +++ b/_freeze/materials/6_wrapping_up/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "9d920c068e646165f1cf8a7318ab23f8", + "hash": "295b9a54170accc4d78ea58e63476007", "result": { "engine": "knitr", - "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n\n\n# Wrapping Up: 'Big' Data Analysis Pipelines with R {#wrapping-up}\n\n## Arrow\n\n- efficiently read + filter + join + summarise 1.15 billion rows\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(janitor)\nlibrary(stringr)\n\nnyc_taxi_zones <- read_csv_arrow(\"data/taxi_zone_lookup.csv\",\n as_data_frame = FALSE) |>\n clean_names()\n \nairport_zones <- nyc_taxi_zones |>\n filter(str_detect(zone, \"Airport\")) |>\n pull(location_id, as_vector = TRUE)\n\ndropoff_zones <- nyc_taxi_zones |>\n select(dropoff_location_id = location_id,\n dropoff_zone = zone) |> \n compute() # run the query but don't pull results into R session\n\nairport_pickups <- open_dataset(\"data/nyc-taxi/\") |>\n filter(pickup_location_id %in% airport_zones) |>\n select(\n matches(\"datetime\"),\n matches(\"location_id\")\n ) |>\n left_join(dropoff_zones) |>\n count(dropoff_zone) |>\n arrange(desc(n)) |>\n collect()\n```\n:::\n\n\n\n\n## R\n\n- read + wrangle spatial data + 🤩 graphics\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(sf)\nlibrary(ggplot2)\nlibrary(ggrepel)\nlibrary(stringr)\nlibrary(scales)\n\nmap <- read_sf(\"data/taxi_zones/taxi_zones.shp\") |>\n clean_names() |>\n left_join(airport_pickups,\n by = c(\"zone\" = \"dropoff_zone\")) |>\n arrange(desc(n))\n\narrow_r_together <- ggplot(data = map, aes(fill = n)) +\n geom_sf(size = .1) +\n scale_fill_distiller(\n name = \"Number of trips\",\n labels = label_comma(),\n palette = \"Reds\",\n direction = 1\n ) +\n geom_label_repel(\n stat = \"sf_coordinates\",\n data = map |>\n mutate(zone_label = case_when(\n str_detect(zone, \"Airport\") ~ zone,\n str_detect(zone, \"Times\") ~ zone,\n .default = \"\"\n )),\n mapping = aes(label = zone_label, geometry = geometry),\n max.overlaps = 60,\n label.padding = .3,\n fill = \"white\"\n ) +\n theme_void()\n```\n:::\n\n\n\n\n## Arrow + R Together: {arrow}\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\narrow_r_together\n```\n\n::: {.cell-output-display}\n![](6_wrapping_up_files/figure-revealjs/arrow_r_together-1.png){width=960}\n:::\n:::\n", + "markdown": "---\nfooter: \"[🔗 pos.it/arrow-conf24](https://pos.it/arrow-conf24)\"\nlogo: \"images/logo.png\"\nexecute:\n echo: true\nformat:\n revealjs: \n theme: default\nengine: knitr\neditor: source\n---\n\n\n# Wrapping Up: 'Big' Data Analysis Pipelines with R {#wrapping-up}\n\n## Arrow\n\n- efficiently read + filter + join + summarise 1.15 billion rows\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(janitor)\nlibrary(stringr)\n\nnyc_taxi_zones <- read_csv_arrow(\"data/taxi_zone_lookup.csv\",\n as_data_frame = FALSE) |>\n clean_names()\n \nairport_zones <- nyc_taxi_zones |>\n 
filter(str_detect(zone, \"Airport\")) |>\n pull(location_id, as_vector = TRUE)\n\ndropoff_zones <- nyc_taxi_zones |>\n select(dropoff_location_id = location_id,\n dropoff_zone = zone) |> \n collect(as_data_frame = FALSE)\n\nairport_pickups <- open_dataset(\"data/nyc-taxi/\") |>\n filter(pickup_location_id %in% airport_zones) |>\n select(\n matches(\"datetime\"),\n matches(\"location_id\")\n ) |>\n left_join(dropoff_zones) |>\n count(dropoff_zone) |>\n arrange(desc(n)) |>\n collect()\n```\n:::\n\n\n## R\n\n- read + wrangle spatial data + 🤩 graphics\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(sf)\nlibrary(ggplot2)\nlibrary(ggrepel)\nlibrary(stringr)\nlibrary(scales)\n\nmap <- read_sf(\"data/taxi_zones/taxi_zones.shp\") |>\n clean_names() |>\n left_join(airport_pickups,\n by = c(\"zone\" = \"dropoff_zone\")) |>\n arrange(desc(n))\n\narrow_r_together <- ggplot(data = map, aes(fill = n)) +\n geom_sf(size = .1) +\n scale_fill_distiller(\n name = \"Number of trips\",\n labels = label_comma(),\n palette = \"Reds\",\n direction = 1\n ) +\n geom_label_repel(\n stat = \"sf_coordinates\",\n data = map |>\n mutate(zone_label = case_when(\n str_detect(zone, \"Airport\") ~ zone,\n str_detect(zone, \"Times\") ~ zone,\n .default = \"\"\n )),\n mapping = aes(label = zone_label, geometry = geometry),\n max.overlaps = 60,\n label.padding = .3,\n fill = \"white\"\n ) +\n theme_void()\n```\n:::\n\n\n## Arrow + R Together: {arrow}\n\n\n::: {.cell}\n\n```{.r .cell-code}\narrow_r_together\n```\n\n::: {.cell-output-display}\n![](6_wrapping_up_files/figure-revealjs/arrow_r_together-1.png){width=960}\n:::\n:::\n", "supporting": [ "6_wrapping_up_files" ], diff --git a/_freeze/materials/6_wrapping_up/figure-revealjs/arrow_r_together-1.png b/_freeze/materials/6_wrapping_up/figure-revealjs/arrow_r_together-1.png index e27b280..358c16e 100644 Binary files a/_freeze/materials/6_wrapping_up/figure-revealjs/arrow_r_together-1.png and b/_freeze/materials/6_wrapping_up/figure-revealjs/arrow_r_together-1.png differ diff --git a/_site/materials/0_housekeeping.html b/_site/materials/0_housekeeping.html index c76174c..c12d2f6 100644 --- a/_site/materials/0_housekeeping.html +++ b/_site/materials/0_housekeeping.html @@ -434,6 +434,15 @@

We Assume

  • You have data in your life that is too large to fit into memory or sluggish in memory
  • You want to learn how to engineer your data storage for more performant access and analysis
+
+
+    Setup
+
+      • Log onto Workbench at the following URL:
+      • Create a new session; select “Resource Profile: Large”
+      • Run usethis::use_course("posit-conf-2024/arrow")
+      • Open data/setup.R and run the script
+
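A minimal R sketch of the setup steps above (the Workbench URL is handed out in the room; sourcing the script is an assumption about how you choose to run it):

```r
# Download and open the workshop project (usethis is assumed to be installed).
usethis::use_course("posit-conf-2024/arrow")

# From inside the opened project, run the data download script.
source("data/setup.R")
```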

    diff --git a/_site/materials/3_data_engineering-exercises.html b/_site/materials/3_data_engineering-exercises.html index 5c768cb..0fe945e 100644 --- a/_site/materials/3_data_engineering-exercises.html +++ b/_site/materials/3_data_engineering-exercises.html @@ -267,8 +267,7 @@

    Schemas

    seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
    -  format = "csv"
    -)
    + format = "csv")
    @@ -293,28 +292,28 @@

    Schemas

    seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
       format = "csv",
    -  skip = 1,
    -  schema(
    -    UsageClass = utf8(),
    -    CheckoutType = utf8(),
    -    MaterialType = utf8(),
    -    CheckoutYear = int64(),
    -    CheckoutMonth = int64(),
    -    Checkouts = int64(),
    -    Title = utf8(),
    -    ISBN = string(), #or utf8()
    -    Creator = utf8(),
    -    Subjects = utf8(),
    -    Publisher = utf8(),
    -    PublicationYear = utf8()
    -  )
    +  schema(
    +    UsageClass = utf8(),
    +    CheckoutType = utf8(),
    +    MaterialType = utf8(),
    +    CheckoutYear = int64(),
    +    CheckoutMonth = int64(),
    +    Checkouts = int64(),
    +    Title = utf8(),
    +    ISBN = string(), #or utf8()
    +    Creator = utf8(),
    +    Subjects = utf8(),
    +    Publisher = utf8(),
    +    PublicationYear = utf8()
    +  ),
    +    skip = 1,
     )

    or

    seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
       format = "csv",
    -  col_types = schema(ISBN = string()) #utf8()
    +  col_types = schema(ISBN = string()) # or utf8()
     )
     seattle_csv
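A quick check that the override took effect (a sketch; `seattle_csv` is the Dataset created just above):

```r
# ISBN should now be reported as string rather than null.
seattle_csv$schema
```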
    @@ -407,7 +406,7 @@

    Schemas

    system.time()
       user  system elapsed 
    - 11.474   1.084  11.003 
    + 10.651 1.091 10.333

    Querying 42 million rows of data stored in a CSV on disk in ~10 seconds, not too bad.
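The timing above comes from piping an Arrow dplyr query straight into `system.time()`. A sketch of that pattern, assuming `seattle_csv` from the earlier slides and using the exercise's grouped summary (not necessarily the exact query that was timed):

```r
library(arrow)
library(dplyr)

seattle_csv |>
  group_by(CheckoutYear) |>
  summarise(Checkouts = sum(Checkouts)) |>
  arrange(CheckoutYear) |>
  collect() |>      # pulls the result into R, which forces the query to run
  system.time()     # elapsed time includes scanning the ~9GB CSV
```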

    @@ -457,7 +456,7 @@

    Parquet

    system.time()
       user  system elapsed 
    -  2.076   0.287   0.646 
    + 1.634 0.345 0.558

    A much faster compute time for the query when the on-disk data is stored in the Parquet format.
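A sketch of how that Parquet copy can be produced and then queried the same way (the output path is an assumption; later slides refer to it as `seattle_parquet`):

```r
seattle_parquet <- "data/seattle-library-checkouts-parquet"  # assumed path

seattle_csv |>
  write_dataset(path = seattle_parquet, format = "parquet")

open_dataset(seattle_parquet) |>
  group_by(CheckoutYear) |>
  summarise(Checkouts = sum(Checkouts)) |>
  arrange(CheckoutYear) |>
  collect() |>
  system.time()
```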

    @@ -517,7 +516,7 @@

    Partitioning

    system.time()
       user  system elapsed 
    -  0.965   0.160   0.409 
    + 0.777 0.072 0.296

    Total number of Checkouts in September of 2019 using partitioned Parquet data by CheckoutYear and CheckoutMonth:
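A sketch of the partitioned layout and the query being timed here (the path matches the later Performance Review slide; the write step and exact pipeline are assumptions):

```r
seattle_partitioned <- "data/seattle-library-checkouts"

# Grouping before write_dataset() gives Hive-style partitions per group.
seattle_csv |>
  group_by(CheckoutYear, CheckoutMonth) |>
  write_dataset(path = seattle_partitioned, format = "parquet")

open_dataset(seattle_partitioned) |>
  filter(CheckoutYear == 2019, CheckoutMonth == 9) |>
  summarise(TotalCheckouts = sum(Checkouts)) |>
  collect() |>
  system.time()
```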

    @@ -529,7 +528,7 @@

    Partitioning

    system.time()
       user  system elapsed 
    -  0.058   0.006   0.052 
    + 0.034 0.005 0.030

    Faster compute time because the filter() call is based on the partitions.

    diff --git a/_site/materials/3_data_engineering.html b/_site/materials/3_data_engineering.html index 5259795..757fada 100644 --- a/_site/materials/3_data_engineering.html +++ b/_site/materials/3_data_engineering.html @@ -447,7 +447,7 @@

    arrow::open_dataset() with a CSV

seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
-  format = "csv")
+  format = "csv")
seattle_csv
    FileSystemDataset with 1 csv file
    @@ -592,23 +592,23 @@ 

    Let’s Control the Schema

    Let’s Control the Schema

    Schema defines column names and types, so we need to skip the first row (skip = 1):

    -
    seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
    +
    seattle_csv <- open_dataset(sources = "data/seattle-library-checkouts.csv",
       format = "csv",
    -  skip = 1,
    -  schema = schema(
    -    UsageClass = utf8(),
    -    CheckoutType = utf8(),
    -    MaterialType = utf8(),
    -    CheckoutYear = int64(),
    -    CheckoutMonth = int64(),
    -    Checkouts = int64(),
    -    Title = utf8(),
    -    ISBN = string(), #utf8()
    -    Creator = utf8(),
    -    Subjects = utf8(),
    -    Publisher = utf8(),
    -    PublicationYear = utf8()
    -  )
    +  schema = schema(
    +    UsageClass = utf8(),
    +    CheckoutType = utf8(),
    +    MaterialType = utf8(),
    +    CheckoutYear = int64(),
    +    CheckoutMonth = int64(),
    +    Checkouts = int64(),
    +    Title = utf8(),
    +    ISBN = string(), #utf8()
    +    Creator = utf8(),
    +    Subjects = utf8(),
    +    Publisher = utf8(),
    +    PublicationYear = utf8()
    +  ),
    +    skip = 1,
     )
     seattle_csv
    @@ -660,7 +660,7 @@

    Let’s Control the Schema

    Your Turn

      -
    1. The first few thousand rows of ISBN are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with open_dataset() and ensure the correct data type for ISBN is <string> instead of the <null> interpreted by Arrow.

+  1. The first few thousand rows of ISBN are blank in the Seattle Checkouts CSV file. Read in the Seattle Checkouts CSV file with open_dataset() and ensure the correct data type for ISBN is <string> (or the alias <utf8>) instead of the <null> interpreted by Arrow.

  2. Once you have a Dataset object with the metadata you are after, count the number of Checkouts by CheckoutYear and arrange the result by CheckoutYear.

    ➡️ Data Storage Engineering Exercises Page

    @@ -709,7 +709,7 @@

    9GB CSV file + arrow + dplyr

    system.time()
       user  system elapsed 
    - 11.581   1.136  11.117 
    + 10.688 1.099 10.451

    42 million rows – not bad, but could be faster….

    @@ -769,7 +769,7 @@

    Storage: Parquet vs CSV

    file <- list.files(seattle_parquet)
     file.size(file.path(seattle_parquet, file)) / 10**9
    -
    [1] 4.423348
    +
    [1] 4.424267


    @@ -794,7 +794,7 @@

    4.5GB Parquet file + arrow + dplyr

    system.time()
       user  system elapsed 
    -  2.018   0.265   0.595 
    + 1.771 0.431 0.568

    42 million rows – much better! But could be even faster….

    @@ -887,7 +887,7 @@

    What Did We “Engineer”?

    # A tibble: 18 × 2
        files                            size_GB
        <chr>                              <dbl>
    - 1 CheckoutYear=2005/part-0.parquet   0.114
    + 1 CheckoutYear=2005/part-0.parquet   0.115
      2 CheckoutYear=2006/part-0.parquet   0.172
      3 CheckoutYear=2007/part-0.parquet   0.186
      4 CheckoutYear=2008/part-0.parquet   0.204
    @@ -901,7 +901,7 @@ 

    What Did We “Engineer”?

12 CheckoutYear=2016/part-0.parquet   0.315
13 CheckoutYear=2017/part-0.parquet   0.319
14 CheckoutYear=2018/part-0.parquet   0.306
-15 CheckoutYear=2019/part-0.parquet   0.302
+15 CheckoutYear=2019/part-0.parquet   0.303
16 CheckoutYear=2020/part-0.parquet   0.158
17 CheckoutYear=2021/part-0.parquet   0.240
18 CheckoutYear=2022/part-0.parquet   0.252
    @@ -922,7 +922,7 @@

    4.5GB partitioned Parquet files + arrow + dplyr

    system.time()
       user  system elapsed 
    -  1.640   0.220   0.267 
    + 1.742 0.385 0.366


    @@ -936,10 +936,6 @@

    Your Turn

    ➡️ Data Storage Engineering Exercises Page

    -
    -

    Partitions & NA Values

    -

    ADD content

    -

    Partition Design

    @@ -953,23 +949,72 @@

    Partition Design

    +
    +

    Partitions & NA Values

    +

    Default:

    +
    +
    partition_na_default_path <- "data/na-partition-default"
    +
    +write_dataset(starwars,
    +              partition_na_default_path,
    +              partitioning = "hair_color")
    +
    +list.files(partition_na_default_path)
    +
    +
     [1] "hair_color=__HIVE_DEFAULT_PARTITION__"
    + [2] "hair_color=auburn"                    
    + [3] "hair_color=auburn%2C%20grey"          
    + [4] "hair_color=auburn%2C%20white"         
    + [5] "hair_color=black"                     
    + [6] "hair_color=blond"                     
    + [7] "hair_color=blonde"                    
    + [8] "hair_color=brown"                     
    + [9] "hair_color=brown%2C%20grey"           
    +[10] "hair_color=grey"                      
    +[11] "hair_color=none"                      
    +[12] "hair_color=white"                     
    +
    +
    +
    +
    +
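A quick way to see how that `__HIVE_DEFAULT_PARTITION__` directory round-trips (an assumption about read-back behaviour: arrow maps it back to `NA` for hive-style partitions):

```r
open_dataset(partition_na_default_path) |>
  count(hair_color) |>
  collect()
# expect an NA hair_color row coming from the __HIVE_DEFAULT_PARTITION__ files
```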

    Partitions & NA Values

    +

    Custom:

    +
    +
    partition_na_custom_path <- "data/na-partition-custom"
    +
    +write_dataset(starwars,
    +              partition_na_custom_path,
    +              partitioning = hive_partition(hair_color = string(),
    +                                            null_fallback = "no_color"))
    +
    +list.files(partition_na_custom_path)
    +
    +
     [1] "hair_color=auburn"            "hair_color=auburn%2C%20grey" 
    + [3] "hair_color=auburn%2C%20white" "hair_color=black"            
    + [5] "hair_color=blond"             "hair_color=blonde"           
    + [7] "hair_color=brown"             "hair_color=brown%2C%20grey"  
    + [9] "hair_color=grey"              "hair_color=no_color"         
    +[11] "hair_color=none"              "hair_color=white"            
    +
    +
    +

    Performance Review: Single CSV

    How long does it take to calculate the number of books checked out in each month of 2021?


    -
    open_dataset(sources = "data/seattle-library-checkouts.csv", 
    -  format = "csv") |> 
    -
    -  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
    -  group_by(CheckoutMonth) |>
    -  summarise(TotalCheckouts = sum(Checkouts)) |>
    -  arrange(desc(CheckoutMonth)) |>
    -  collect() |>
    -  system.time()
    +
    open_dataset(sources = "data/seattle-library-checkouts.csv", 
    +  format = "csv") |> 
    +
    +  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
    +  group_by(CheckoutMonth) |>
    +  summarise(TotalCheckouts = sum(Checkouts)) |>
    +  arrange(desc(CheckoutMonth)) |>
    +  collect() |>
    +  system.time()
       user  system elapsed 
    - 13.362   1.763  12.438 
    + 11.718 1.106 11.250
    @@ -978,17 +1023,17 @@

    Performance Review: Partitioned Parquet

    How long does it take to calculate the number of books checked out in each month of 2021?


    -
    open_dataset(sources = "data/seattle-library-checkouts",
    -             format = "parquet") |> 
    -  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
    -  group_by(CheckoutMonth) |>
    -  summarise(TotalCheckouts = sum(Checkouts)) |>
    -  arrange(desc(CheckoutMonth)) |>
    -  collect() |> 
    -  system.time()
    +
    open_dataset(sources = "data/seattle-library-checkouts",
    +             format = "parquet") |> 
    +  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
    +  group_by(CheckoutMonth) |>
    +  summarise(TotalCheckouts = sum(Checkouts)) |>
    +  arrange(desc(CheckoutMonth)) |>
    +  collect() |> 
    +  system.time()
       user  system elapsed 
    -  0.330   0.039   0.091 
    + 0.224 0.040 0.068
    diff --git a/_site/materials/5_arrow_single_file.html b/_site/materials/5_arrow_single_file.html index 06a9782..bbc5840 100644 --- a/_site/materials/5_arrow_single_file.html +++ b/_site/materials/5_arrow_single_file.html @@ -431,16 +431,16 @@

    Read a Parquet File (tibble)

    # A tibble: 6,567,396 × 22
        vendor_name pickup_datetime     dropoff_datetime    passenger_count
        <chr>       <dttm>              <dttm>                        <int>
    - 1 VTS         2019-09-01 06:14:09 2019-09-01 06:31:52               2
    - 2 VTS         2019-09-01 06:36:17 2019-09-01 07:12:44               1
    - 3 VTS         2019-09-01 06:29:19 2019-09-01 06:54:13               1
    - 4 CMT         2019-09-01 06:33:09 2019-09-01 06:52:14               2
    - 5 VTS         2019-09-01 06:57:43 2019-09-01 07:26:21               1
    - 6 CMT         2019-09-01 06:59:16 2019-09-01 07:28:12               1
    - 7 CMT         2019-09-01 06:20:06 2019-09-01 06:52:19               1
    - 8 CMT         2019-09-01 06:27:54 2019-09-01 06:32:56               0
    - 9 CMT         2019-09-01 06:35:08 2019-09-01 06:55:51               0
    -10 CMT         2019-09-01 06:19:37 2019-09-01 06:30:52               1
    + 1 CMT         2019-08-31 18:09:30 2019-08-31 18:15:42               1
    + 2 CMT         2019-08-31 18:26:30 2019-08-31 18:44:31               1
    + 3 CMT         2019-08-31 18:39:35 2019-08-31 19:15:55               2
    + 4 VTS         2019-08-31 18:12:26 2019-08-31 18:15:17               4
    + 5 VTS         2019-08-31 18:43:16 2019-08-31 18:53:50               1
    + 6 VTS         2019-08-31 18:26:13 2019-08-31 18:45:35               1
    + 7 CMT         2019-08-31 18:34:52 2019-08-31 18:42:03               1
    + 8 CMT         2019-08-31 18:50:02 2019-08-31 18:58:16               1
    + 9 CMT         2019-08-31 18:08:02 2019-08-31 18:14:44               0
    +10 VTS         2019-08-31 18:11:38 2019-08-31 18:26:47               1
     # ℹ 6,567,386 more rows
     # ℹ 18 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
     #   pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
    @@ -506,6 +506,70 @@ 

tibble <-> Table <-> data.frame
  • Table is an Arrow object in-memory
  • +
    +

    Watch Your Schemas 👀

    +
    +
    +
    +
    schema(taxi_df)
    +
    +
    Schema
    +vendor_name: string
    +pickup_datetime: timestamp[us, tz=America/Vancouver]
    +dropoff_datetime: timestamp[us, tz=America/Vancouver]
    +passenger_count: int32
    +trip_distance: double
    +pickup_longitude: double
    +pickup_latitude: double
    +rate_code: string
    +store_and_fwd: string
    +dropoff_longitude: double
    +dropoff_latitude: double
    +payment_type: string
    +fare_amount: double
    +extra: double
    +mta_tax: double
    +tip_amount: double
    +tolls_amount: double
    +total_amount: double
    +improvement_surcharge: double
    +congestion_surcharge: double
    +pickup_location_id: int32
    +dropoff_location_id: int32
    +
    +
    +
    +
    +
    schema(taxi_table)
    +
    +
    Schema
    +vendor_name: string
    +pickup_datetime: timestamp[ms]
    +dropoff_datetime: timestamp[ms]
    +passenger_count: int64
    +trip_distance: double
    +pickup_longitude: double
    +pickup_latitude: double
    +rate_code: string
    +store_and_fwd: string
    +dropoff_longitude: double
    +dropoff_latitude: double
    +payment_type: string
    +fare_amount: double
    +extra: double
    +mta_tax: double
    +tip_amount: double
    +tolls_amount: double
    +total_amount: double
    +improvement_surcharge: double
    +congestion_surcharge: double
    +pickup_location_id: int64
    +dropoff_location_id: int64
    +
    +
    +
    +
    +
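The two schemas differ because the tibble was converted to R types on read (POSIXct picks up the session time zone, integers become int32), while the Table keeps the types stored in the Parquet file. A small sketch to compare one field (extracting a field by name with `[[` is assumed to work here):

```r
schema(taxi_df)[["pickup_datetime"]]     # timestamp[us, tz=America/Vancouver]
schema(taxi_table)[["pickup_datetime"]]  # timestamp[ms]
```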

    Data frames

    @@ -530,13 +594,13 @@

    Arrow Tables

    Table | Dataset: A dplyr pipeline

    -
    parquet_file |>
    -  read_parquet(as_data_frame = FALSE) |>
    -  group_by(vendor_name) |>
    -  summarise(all_trips = n(),
    -            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    -  mutate(pct_shared = shared_trips / all_trips * 100) |>
    -  collect()
    +
    parquet_file |>
    +  read_parquet(as_data_frame = FALSE) |>
    +  group_by(vendor_name) |>
    +  summarise(all_trips = n(),
    +            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    +  mutate(pct_shared = shared_trips / all_trips * 100) |>
    +  collect()
    # A tibble: 3 × 4
       vendor_name all_trips shared_trips pct_shared
    @@ -565,63 +629,63 @@ 

    Table | Dataset: A dplyr pipeline

    Arrow for Efficient In-Memory Processing

    -
    parquet_file |>
    -  read_parquet() |>
    -  nrow()
    +
    parquet_file |>
    +  read_parquet() |>
    +  nrow()
    [1] 6567396


    -
    parquet_file |>
    -  read_parquet() |>
    -  group_by(vendor_name) |>
    -  summarise(all_trips = n(),
    -            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    -  mutate(pct_shared = shared_trips / all_trips * 100) |>
    -  collect() |>
    -  system.time()
    +
    parquet_file |>
    +  read_parquet() |>
    +  group_by(vendor_name) |>
    +  summarise(all_trips = n(),
    +            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    +  mutate(pct_shared = shared_trips / all_trips * 100) |>
    +  collect() |>
    +  system.time()
       user  system elapsed 
    -  2.214   0.575   0.814 
    + 1.157 0.261 0.509

    Arrow for Efficient In-Memory Processing

    -
    parquet_file |>
    -  read_parquet(as_data_frame = FALSE) |>
    -  nrow()
    +
    parquet_file |>
    +  read_parquet(as_data_frame = FALSE) |>
    +  nrow()
    [1] 6567396


    -
    parquet_file |>
    -  read_parquet(as_data_frame = FALSE) |>
    -  group_by(vendor_name) |>
    -  summarise(all_trips = n(),
    -            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    -  mutate(pct_shared = shared_trips / all_trips * 100) |>
    -  collect() |>
    -  system.time()
    +
    parquet_file |>
    +  read_parquet(as_data_frame = FALSE) |>
    +  group_by(vendor_name) |>
    +  summarise(all_trips = n(),
    +            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    +  mutate(pct_shared = shared_trips / all_trips * 100) |>
    +  collect() |>
    +  system.time()
       user  system elapsed 
    -  1.995   0.343   0.366 
    + 1.047 0.203 0.220

    Read a Parquet File Selectively

    -
    parquet_file |>
    -  read_parquet(
    -    col_select = c("vendor_name", "passenger_count"),
    -    as_data_frame = FALSE
    -  )
    +
    parquet_file |>
    +  read_parquet(
    +    col_select = c("vendor_name", "passenger_count"),
    +    as_data_frame = FALSE
    +  )
    Table
     6567396 rows x 2 columns
    @@ -633,20 +697,20 @@ 

    Read a Parquet File Selectively

    Selective Reads Are Faster

    -
    parquet_file |>
    -  read_parquet(
    -    col_select = c("vendor_name", "passenger_count"),
    -    as_data_frame = FALSE
    -  ) |> 
    -  group_by(vendor_name) |>
    -  summarise(all_trips = n(),
    -            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    -  mutate(pct_shared = shared_trips / all_trips * 100) |>
    -  collect() |>
    -  system.time()
    +
    parquet_file |>
    +  read_parquet(
    +    col_select = c("vendor_name", "passenger_count"),
    +    as_data_frame = FALSE
    +  ) |> 
    +  group_by(vendor_name) |>
    +  summarise(all_trips = n(),
    +            shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
    +  mutate(pct_shared = shared_trips / all_trips * 100) |>
    +  collect() |>
    +  system.time()
       user  system elapsed 
    -  0.323   0.088   0.234 
    + 0.258 0.011 0.131