From 92941dec04ae61822d63da1507432a752575ef29 Mon Sep 17 00:00:00 2001 From: Stephan Koenig Date: Sat, 30 May 2020 16:49:35 -0700 Subject: [PATCH 01/11] Move file from master to individual branch --- .../data_wrangling_basic.Rmd | 490 ++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100644 inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd new file mode 100644 index 0000000..263ef1b --- /dev/null +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -0,0 +1,490 @@ +--- +title: "Introduction to data wrangling" +author: "Michelle Kang" +date: "05/02/2020" +output: + learnr::tutorial: + progressive: true + allow_skip: true +runtime: shiny_prerendered +description: This file contains the first of three data wrangling tutorials using the tidyverse package in R. Along with an introduction on downloading and loading packages, this tutorial introduces loading and visualising tabular data tables, and the filter, slice and select functions of tidyverse. +--- + +```{r setup, include = FALSE} +# General learnr setup +library(learnr) +knitr::opts_chunk$set(echo = TRUE) +library(educer) +# Helper function to set path to images to "/images" etc. +setup_resources() + +# Tutorial specific setup +library(readr) +library(tidyverse) + +OTU_metadata_table <- combined + +subset_dat <- slice(geochemicals, 710, 713, 715, 716, 709, 717, 718, 719) + +x = 10 #for boolean exercise +restricted_columns <- select(OTU_metadata_table, OTU0001, + OTU0002, OTU0004, Depth) +summary_solution_1 <- + geochemicals %>% + select(Cruise, Date, Depth, CTD_O2) %>% + filter(Cruise == 72 & Depth >= 0.05) + +summary_solution_2 <- + geochemicals %>% + filter(CTD_O2 > 0 | NO3 > 0) %>% + select(Cruise, Depth) +``` + + + +## Objectives + +### By the end of this tutorial you should be able to: + +- Install and load R packages. 
+- Load tabular data using `read_csv()` and save the data to your R environment. +- Use the `filter()`, `slice()` and `select()` methods to conditionally subset your data. + + + +## R packages + +R packages are units of shareable code, containing functions that facilitate and enhance analyses. In simpler terms, think of R packages as iPhone Applications. Each App has specific functions and capabilities that can be accessed when we install then open the application. The same can be said about R packages. In order to use the functions for a specific R package, we first need to install the package, then each time we want to use the package we need to "open" the package. + +In this tutorial we will be using the "tidyverse" package. This package contains a versatile set of functions designed for easy manipulation of data. + +### Installing Packages + +The tidyverse package can be installed like this (to install a different package just replace "tidyverse" with the name of the desired package): + +**For R v3.4 or newer** +```{r eval = FALSE} +install.packages("tidyverse") +``` + +**For R v3.3 or older** +Unfortunately, tidyverse does not work on older versions of R. Instead, you will need to individually install each package within the tidyverse. +```{r eval=FALSE} +install.packages("tibble") +install.packages("readr") +install.packages("dplyr") +install.packages("tidyr") +install.packages("ggplot2") +install.packages("stringr") +install.packages("purrr") +install.packages("forcats") +``` + + + +## Loading packages + +After installing a package, and *everytime* you open a new RStudio session, the packages you want to use need to be loaded (opened) into the R work-space with the `library()` function. This tells R to access the package's functions and prevents RStudio from lags that would occur if it automatically loaded every downloaded package every time you opened it. 
+ +Packages can be loaded like this: + +```{r tidyverse_load, exercise = TRUE, exercise.lines = 5} +library(tidyverse) +library(educer) +``` + + + +## Tidyverse vs. base R + +The [tidyverse](https://www.tidyverse.org/) is a collection of R packages for data wrangling, analysis, and visualization. + +The main advantages of using the tidyverse to read in data over base R are: + +- Faster data processing +- Seamless integration with other tidyverse functions +- Automatic designation of data types +- Data storage in tibble as opposed to data frames + - Tibbles are data frames with an additional layer of formatting that causes them to print nicely in the console and always return a tibble in functions + + +A popular package for data wrangling is *dplyr* in the tidyverse. This package is so good at what it does, and integrates so well with other popular tools like *ggplot2*, that it has rapidly become the de-facto standard. + +dplyr code is very readable because all operations are based on using dplyr functions or *verbs* (select, filter, mutate...). + +Typical data wrangling tasks in dplyr: + +- `select()` a subset of variables (columns) +- `slice()` out rows by their ordinal position in the tbl +- `filter()` out a subset of observations (rows) +- `rename()` variables +- `arrange()` the observations by sorting a variable in ascending or descending order +- `mutate()` all values of a variable (apply a transformation) +- `group_by()` a variable and `summarise` data by the grouped variable +- `*_join()` two data frames into a single data frame + +Each verb works similarly: + +- Input data frame in the first argument. +- Other arguments can refer to variables as if they were local objects +- Output is another data frame + +Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practices for data science in general. 
+ +```{r} +# dat <- raw_dat +``` + +We will then continually overwrite this object with `<-` as we clean it in R. + + + +### Data description +The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia. + +The data that you will use in R are 16S amplicon profiles of microbial communities at several depths in Saanich Inlet from one time point in this series (August 2012). These ~300 bp sequences were processed using [mothur](https://www.mothur.org/wiki/Main_Page) to yield 97% (approximately species-level) operational taxonomic units (OTUs). + +`combined` is a comma-delimited table of counts of four OTUs in each sample, normalized to 100,000 sequences per sample and the corresponding conditions of each sample (Depth, NO2, NO3 etc). + +For a brief introduction to these data, see Hallam SJ et al. 2017. Monitoring microbial responses to ocean deoxygenation in a model oxygen minimum zone. Sci Data 4: 170158 [doi:10.1038/sdata.2017.158](https://www.nature.com/articles/sdata2017158). + +Click the button below to save a copy of this data set to your computer: + +```{r echo = FALSE} +# Download button shiny app UI +fluidRow( + column(12, align = "center", downloadButton("downloadData", "Download")) +) + +``` + +```{r context = "server"} +# Download button shiny app server +output$downloadData <- downloadHandler( + filename = "combined.csv", + content = function(file) { + write_csv(combined, file) + } +) +``` + + + +## Loading tabular data + +Every R function follows the following basic syntax, where `function()` is the name of the function and `arguments` are the different parameters you can specify. + +`function(argument1=..., argument2=..., ...)` + +Data tables can be loaded into R using the tidyverse `read_*()` function. + +The `read_table()` function allows us to open raw datafiles. 
Run the following code to see what `combined.csv` looks like. + +```{r eval = FALSE} +read_table("combined.csv") +``` + +Notice how values in `combined.csv` are separated by commas. We can load our Saanich data into R with `read_csv()` for comma separated files and specify the arguments that describe our data as follows. + +- `col_names`: tells R that the first row is column names, not data + +```{r eval = FALSE} +read_csv(file = "combined.csv", col_names = TRUE) +``` + +Now our data is formatted nicely into table form. + +### Save data in the environment + +Since we want to do more with our data after reading it in, we need to save it as a variable in R like we did previously with the `<-` operator. You can choose to name the object whatever you like, though this module assumes the names used below. + +```{r eval = FALSE} +OTU_metadata_table <- read_csv(file = "combined.csv", col_names = TRUE) +``` + +```{r saving-quiz, echo = FALSE} +quiz( + question("How do we make sure the datatable is saved to 'OTU_metadata_table' in our environment?", + answer("Entering `\"OTU_metadata_table\"` (with quotes) in the console displays the table."), + answer("Entering `OTU_metadata_table` (no quotes) in the console displays the table.", correct = TRUE), + answer("`OTU_metadata_table` shows up in the \"Global Environment\" box on the top right hand corner.", correct = TRUE) + ) +) +``` + + + +## Data exploration + +Let's explore the data that we've imported into R. The simplest way to view your imported data is to view it as a "tibble" like this. This view displays a subset of large data tables (notice that the last column name gets cut off) in a table view + +```{r tibble, exercise = TRUE, exercise.lines = 5} +OTU_metadata_table +``` + +`glimpse()` is a function that as its name suggests, allows us to get a "glimpse" of the contents of a data table. 
Running `glimpse()` on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let's run `glimpse()` with our OTU_metadata_table like this: + +```{r glimpse, exercise = TRUE, exercise.lines = 5} +glimpse(OTU_metadata_table) +``` +from this we see that our table has 7 rows and 10 columns. Each $ is followed by a column name, with information on the contents following each column name. `glimpse()` lists all columns of a table + +### Exercise + +```{r glimpse-quiz, echo = FALSE} +quiz( + question("Which columns are in the OTU_metadata_table?", + answer("OTU001", correct = TRUE), + answer("Otu002"), + answer("72"), + answer("NO3", correct = TRUE), + answer("NO3_Mean"), + answer("Mean_N2O", correct = TRUE), + answer("Depth", correct = TRUE) + ) +) +``` + +If we only want the dimensions of a dataframe or table, we can use the `dim()` function which prints the number of rows followed by the number of columns. Simple functions to query just the number of rows or columns in a data table are `nrow()` and `ncol()`. + +```{r dim, exercise = TRUE, exercise.lines = 5} +#number of rows followed by number of columns +dim(OTU_metadata_table) +#number of rows +nrow(OTU_metadata_table) +#number of columns +ncol(OTU_metadata_table) +``` + +We can list the column names using `colnames()`. + +```{R colnames,exercise = TRUE, exercise.lines = 5} +colnames(OTU_metadata_table) +``` + + + +## `select()` + +You can use the `select()` function to keep only a subset of variables (columns). Let's select the variables `OTU0001`, `OTU0002`, `OTU0004`, `Depth`. 
+ +```{R select-1, exercise = TRUE, exercise.lines = 5} +restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth) +``` + +To view our new `restricted_columns` variable, just type in the variable name and run the code like this: + +```{R select-2, exercise = TRUE, exercise.lines = 5} +restricted_columns +``` + +### Exercise +As an exercise, select for only the depth and geochemical columns in `OTU_metadata_table` and name the new table `metadata`: + +```{r select-exercise, exercise = TRUE, exercise.lines = 5} + +``` + +### Exercise: `select()` + +Select the Cruise, Date, Depth, PO4, and WS_NO3 variables of the `geochemicals` data set + +```{r select-exercise-1, exercise=TRUE, exercise.lines = 3} + +dat <- select() + +dat +``` + + + + +## Booleans + +Booleans are logical statements that are either `TRUE` or `FALSE` but can not be anything in between. As an example, run the code below: + +```{r boolean-exercise, exercise = TRUE, exercise.lines = 5} +x <- 6 +y <- "cat" + +x < 3 +y == "dog" +``` + +The equation `x < 3` is `FALSE` because x is set to 6 in the line above. As a simple exercise, manipulate the above code to make both equations `TRUE`. + +note that in R, `==` is used in Boolean equations and using a single `=` will result in error. As you may have noticed above a single `=` is used to set a variable to a value. + +For quick reference, here are the most commonly used statements and operators. + +R code | meaning +---------- | --------------- +`==` | equals +`< or >` | less/greater than +`<= or >= `| less/greater than or equal to +`%in%` | in +`is.na` | is missing (`NA`) +`!` | not (as in not equal to `!=`) +`&` | and +`|` | or + + +### Exercise + +Write a boolean equation for "x is greater than 6 or less than 12", it should return `TRUE` after running. + +```{r boolean-exercise-2, exercise = TRUE, exercise.lines = 5} + +``` + +### `filter()` + +Conditional statements and logical operators are important when working with data in R. 
We will practice using different conditional statements and logical operators on the oxygen data in a subset of the `geochemicals` data set. You can use `filter()` to select specific rows based on a logical condition of a variable. + + +```{r} +subset_dat <- slice(geochemicals, 710, 713, 715, 716, 709, 717, 718, 719) +``` + +`variable == value` returns rows where the variable matches the value: + +```{r equal-to, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 == 204.259) +``` + +`variable != value` returns rows where the variable does not match the value: + +```{r not-equal-to, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 != 204.259) +``` + +`variable > value` returns rows where the variable is greater than the value: + +```{r greater-than, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 > 204.259) +``` + +`variable %in% values` returns rows where the variable matches one of the given values. +Values are provided as a vector `c(value1, value2, ...)`: + +```{r match-in, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 %in% c(40.745, 204.259)) +``` + +`is.na(variable)` returns rows where the variable is `NA` (Not Available). + +```{r is-na, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, is.na(CTD_O2)) +``` + +`!condition` returns rows where the condition is not fulfilled. + +```{r opposite, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, !is.na(CTD_O2)) +``` + +We can look for a range of values by finding the rows where the value of the variable is <= 120 **AND** >= 20 bu using the logical operator `&`. + +```{r and, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 <= 120 & CTD_O2 >= 20) +``` + +Logical OR `|`. Find the rows where the value is <= 50 **OR** >= 150. 
+ +```{r or, exercise = TRUE, exercise.lines = 5} +filter(subset_dat, CTD_O2 <= 50 | CTD_O2 >= 150) +``` + +### Exercise + +As an exercise, restrict for rows where the value for "depth" is less than or equal to 135m. +```{r filter-exercise, exercise = TRUE, exercise.lines = 5} + +``` + + + +## `slice()` + +We can also only choose to work with specific rows in our data table using the `slice()` function. + +To select a subset of observations (rows) by their ordinal position, we use the `slice()` function. + +```{r slice-1, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1) +``` + +You can list multiple ordinal postions to select multiple observations at once. + +```{r slice-2, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1, 2, 3, 4, 5) +``` + +If you would like to to select a range of observations, give the starting and end position separated by a colon like so: `:`. + +```{r slice-3, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1:5) +``` + +```{r slice-quiz, echo = FALSE} +quiz( + question("What is the value of OTU0003 in the 6th row of OTU_metadata_table?", + answer("0"), + answer("156"), + answer("178", correct=TRUE), + answer("72") + ) +) +``` + +### Exercise: `slice()` and `select()` +Using `slice()` and `select()`, determine: + +A) what depth value occurs in the 20th row? +B) what methane value occurs in the 170th row? + +```{r slice_exercise, exercise=TRUE, exercise.lines=5} +dat <- OTU_metadata_table +``` + + + +## Summary Exercise + +The `geochemicals` dataset is included in the "educer" package. This dataframe contains time series observations on the water column chemistry. Learn more about the `geochemicals` dataset by running the following line in your R console. + +```{r dataset_exercise, exercise = TRUE, exercise.lines = 5} +?geochemicals +``` + +Using the geochemical data: + +1. Select the Cruise, Date, Depth, and oxygen variables. +2. 
Filter the data to retain data on Cruise 72 where Depth is greater than or equal to 0.05 km. + +Your resulting pdat object should be a [`r dim(summary_solution_1)`] data frame. The data has been loaded for you into the `dat` variable. + +```{r summary-exercise, exercise = TRUE, exercise.lines = 5} +dat <- geochemicals +``` + +### Challenge exercise: `select()` and `filter()` + +If you want more practice or have previous experience in R, try this more challenging exercise! Be sure to create a fresh `dat`. + +3. Keep only the Cruise and Depth variables and the rows where oxygen OR nitrate is greater than zero. + +Your resulting pdat object should be a [`r dim(summary_solution_2)`] data frame. *Hint:* Can you filter based on a variable that you previously removed by not selecting it? + +```{r summary-exercise-2, exercise = TRUE, exercise.lines = 5} +dat <- geochemicals +``` + + + +## Additional resources + +* [R cheatsheets](https://www.rstudio.com/resources/cheatsheets/) also available in RStudio under Help > Cheatsheets +* [Introduction to dplyr](https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html) +* [dplyr tutorial](https://rpubs.com/justmarkham/dplyr-tutorial) +* [dplyr video tutorial](https://www.r-bloggers.com/hands-on-dplyr-tutorial-for-faster-data-manipulation-in-r/) From 4e546702675dbd82b83d8584142deb6ae0a53d57 Mon Sep 17 00:00:00 2001 From: r-karimi Date: Mon, 12 Oct 2020 09:38:11 -0700 Subject: [PATCH 02/11] Remove tidyverse installation instructions --- .../data_wrangling_basic.Rmd | 28 - .../data_wrangling_basic.html | 818 ++++++++++++++++++ 2 files changed, 818 insertions(+), 28 deletions(-) create mode 100644 inst/tutorials/data_wrangling_basic/data_wrangling_basic.html diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index 263ef1b..c8a5763 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ 
b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -52,34 +52,6 @@ summary_solution_2 <- -## R packages - -R packages are units of shareable code, containing functions that facilitate and enhance analyses. In simpler terms, think of R packages as iPhone Applications. Each App has specific functions and capabilities that can be accessed when we install then open the application. The same can be said about R packages. In order to use the functions for a specific R package, we first need to install the package, then each time we want to use the package we need to "open" the package. - -In this tutorial we will be using the "tidyverse" package. This package contains a versatile set of functions designed for easy manipulation of data. - -### Installing Packages - -The tidyverse package can be installed like this (to install a different package just replace "tidyverse" with the name of the desired package): - -**For R v3.4 or newer** -```{r eval = FALSE} -install.packages("tidyverse") -``` - -**For R v3.3 or older** -Unfortunately, tidyverse does not work on older versions of R. Instead, you will need to individually install each package within the tidyverse. -```{r eval=FALSE} -install.packages("tibble") -install.packages("readr") -install.packages("dplyr") -install.packages("tidyr") -install.packages("ggplot2") -install.packages("stringr") -install.packages("purrr") -install.packages("forcats") -``` - ## Loading packages diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html new file mode 100644 index 0000000..32a8e3b --- /dev/null +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -0,0 +1,818 @@ + + + + + + + + + + + + + + + + + +Introduction to data wrangling + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ +
+

Objectives

+
+

By the end of this tutorial you should be able to:

+
    +
  • Install and load R packages.
  • +
  • Load tabular data using read_csv() and save the data to your R environment.
  • +
  • Use the filter(), slice() and select() methods to conditionally subset your data.
  • +
+
+
+
+

R packages

+

R packages are units of shareable code, containing functions that facilitate and enhance analyses. In simpler terms, think of R packages as iPhone Applications. Each App has specific functions and capabilities that can be accessed when we install then open the application. The same can be said about R packages. In order to use the functions for a specific R package, we first need to install the package, then each time we want to use the package we need to “open” the package.

+

In this tutorial we will be using the “tidyverse” package. This package contains a versatile set of functions designed for easy manipulation of data.

+
+

Installing Packages

+

The tidyverse package can be installed like this (to install a different package just replace “tidyverse” with the name of the desired package):

+

For R v3.4 or newer

+
install.packages("tidyverse")
+

For R v3.3 or older
+Unfortunately, tidyverse does not work on older versions of R. Instead, you will need to individually install each package within the tidyverse.

+
install.packages("tibble")
+install.packages("readr")
+install.packages("dplyr")
+install.packages("tidyr")
+install.packages("ggplot2")
+install.packages("stringr")
+install.packages("purrr")
+install.packages("forcats")
+
+
+
+

Loading packages

+


After installing a package, and every time you open a new RStudio session, the packages you want to use need to be loaded (opened) into the R work-space with the library() function. This tells R to access the package’s functions and prevents the lags that would occur if RStudio automatically loaded every downloaded package every time you opened it.


+

Packages can be loaded like this:

+
+
library(tidyverse)
+library(educer)
+ +
+
+
+

Tidyverse vs. base R

+

The tidyverse is a collection of R packages for data wrangling, analysis, and visualization.

+

The main advantages of using the tidyverse to read in data over base R are:

+
    +
  • Faster data processing
  • +
  • Seamless integration with other tidyverse functions
  • +
  • Automatic designation of data types
  • +
  • Data storage in tibble as opposed to data frames +
      +
    • Tibbles are data frames with an additional layer of formatting that causes them to print nicely in the console and always return a tibble in functions
    • +
  • +
+

A popular package for data wrangling is dplyr in the tidyverse. This package is so good at what it does, and integrates so well with other popular tools like ggplot2, that it has rapidly become the de-facto standard.

+

dplyr code is very readable because all operations are based on using dplyr functions or verbs (select, filter, mutate…).

+

Typical data wrangling tasks in dplyr:

+
    +
  • select() a subset of variables (columns)
  • +
  • slice() out rows by their ordinal position in the tbl
  • +
  • filter() out a subset of observations (rows)
  • +
  • rename() variables
  • +
  • arrange() the observations by sorting a variable in ascending or descending order
  • +
  • mutate() all values of a variable (apply a transformation)
  • +
  • group_by() a variable and summarise data by the grouped variable
  • +
  • *_join() two data frames into a single data frame
  • +
+

Each verb works similarly:

+
    +
  • Input data frame in the first argument.
  • +
  • Other arguments can refer to variables as if they were local objects
  • +
  • Output is another data frame
  • +
+


Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practice for data science in general.


+
# dat <- raw_dat
+

We will then continually overwrite this object with <- as we clean it in R.

+
+

Data description

+

The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia.

+

The data that you will use in R are 16S amplicon profiles of microbial communities at several depths in Saanich Inlet from one time point in this series (August 2012). These ~300 bp sequences were processed using mothur to yield 97% (approximately species-level) operational taxonomic units (OTUs).

+

combined is a comma-delimited table of counts of four OTUs in each sample, normalized to 100,000 sequences per sample and the corresponding conditions of each sample (Depth, NO2, NO3 etc).

+

For a brief introduction to these data, see Hallam SJ et al. 2017. Monitoring microbial responses to ocean deoxygenation in a model oxygen minimum zone. Sci Data 4: 170158 doi:10.1038/sdata.2017.158.

+

Click the button below to save a copy of this data set to your computer:

+ +
+
+
+

Loading tabular data

+


Every R function follows this basic syntax, where function() is the name of the function and arguments are the different parameters you can specify.


+

function(argument1=..., argument2=..., ...)

+

Data tables can be loaded into R using the tidyverse read_*() function.

+


The read_table() function allows us to open raw data files. Run the following code to see what combined.csv looks like.


+
read_table("combined.csv")
+

Notice how values in combined.csv are separated by commas. We can load our Saanich data into R with read_csv() for comma separated files and specify the arguments that describe our data as follows.

+
    +
  • col_names: tells R that the first row is column names, not data
  • +
+
read_csv(file = "combined.csv", col_names = TRUE)
+

Now our data is formatted nicely into table form.

+
+

Save data in the environment

+

Since we want to do more with our data after reading it in, we need to save it as a variable in R like we did previously with the <- operator. You can choose to name the object whatever you like, though this module assumes the names used below.

+
OTU_metadata_table <- read_csv(file = "combined.csv", col_names = TRUE)
+

Quiz
+
+
+
+
+ +
+

+
+
+
+

Data exploration

+


Let’s explore the data that we’ve imported into R. The simplest way to view your imported data is to view it as a “tibble” like this. This view displays a subset of large data tables (notice that the last column name gets cut off) in a table view.


+
+
OTU_metadata_table
+ +
+


glimpse() is a function that, as its name suggests, allows us to get a “glimpse” of the contents of a data table. Running glimpse() on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let’s run glimpse() with our OTU_metadata_table like this:


+
+
glimpse(OTU_metadata_table)
+ +
+


From this we see that our table has 7 rows and 10 columns. Each $ is followed by a column name, with information on the contents following each column name. glimpse() lists all columns of a table.


+
+

Exercise

+

Quiz
+
+
+
+
+ +
+

+

If we only want the dimensions of a dataframe or table, we can use the dim() function which prints the number of rows followed by the number of columns. Simple functions to query just the number of rows or columns in a data table are nrow() and ncol().

+
+
#number of rows followed by number of columns
+dim(OTU_metadata_table)
+#number of rows
+nrow(OTU_metadata_table)
+#number of columns
+ncol(OTU_metadata_table)
+ +
+

We can list the column names using colnames().

+
+
colnames(OTU_metadata_table)
+ +
+
+
+
+

select()

+

You can use the select() function to keep only a subset of variables (columns). Let’s select the variables OTU0001, OTU0002, OTU0004, Depth.

+
+
restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth)
+ +
+

To view our new restricted_columns variable, just type in the variable name and run the code like this:

+
+
restricted_columns
+ +
+
+

Exercise

+

As an exercise, select for only the depth and geochemical columns in OTU_metadata_table and name the new table metadata:

+
+ +
+
+
+

Exercise: select()

+

Select the Cruise, Date, Depth, PO4, and WS_NO3 variables of the geochemicals data set

+
+
dat <- select()
+
+dat
+ +
+
+
+
+

Booleans

+

Booleans are logical statements that are either TRUE or FALSE but can not be anything in between. As an example, run the code below:

+
+
x <- 6
+y <- "cat"
+
+x < 3
+y == "dog"
+ +
+

The equation x < 3 is FALSE because x is set to 6 in the line above. As a simple exercise, manipulate the above code to make both equations TRUE.

+


Note that in R, == is used in Boolean equations and using a single = will result in an error. As you may have noticed above, a single = is used to set a variable to a value.


+

For quick reference, here are the most commonly used statements and operators.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
R codemeaning
==equals
< or >less/greater than
<= or >=less/greater than or equal to
%in%in
is.nais missing (NA)
!not (as in not equal to !=)
&and
|or
+
+

Exercise

+

Write a boolean equation for “x is greater than 6 or less than 12”, it should return TRUE after running.

+
+ +
+
+
+

filter()

+

Conditional statements and logical operators are important when working with data in R. We will practice using different conditional statements and logical operators on the oxygen data in a subset of the geochemicals data set. You can use filter() to select specific rows based on a logical condition of a variable.

+
subset_dat <- slice(geochemicals, 710, 713, 715, 716, 709, 717, 718, 719)
+

variable == value returns rows where the variable matches the value:

+
+
filter(subset_dat, CTD_O2 == 204.259)
+ +
+

variable != value returns rows where the variable does not match the value:

+
+
filter(subset_dat, CTD_O2 != 204.259)
+ +
+

variable > value returns rows where the variable is greater than the value:

+
+
filter(subset_dat, CTD_O2 > 204.259)
+ +
+

variable %in% values returns rows where the variable matches one of the given values. Values are provided as a vector c(value1, value2, ...):

+
+
filter(subset_dat, CTD_O2 %in% c(40.745, 204.259))
+ +
+

is.na(variable) returns rows where the variable is NA (Not Available).

+
+
filter(subset_dat, is.na(CTD_O2))
+ +
+

!condition returns rows where the condition is not fulfilled.

+
+
filter(subset_dat, !is.na(CTD_O2))
+ +
+


We can look for a range of values by finding the rows where the value of the variable is <= 120 AND >= 20 by using the logical operator &.


+
+
filter(subset_dat, CTD_O2 <= 120 & CTD_O2 >= 20)
+ +
+

Logical OR |. Find the rows where the value is <= 50 OR >= 150.

+
+
filter(subset_dat, CTD_O2 <= 50 | CTD_O2 >= 150)
+ +
+
+
+

Exercise

+As an exercise, restrict for rows where the value for “depth” is less than or equal to 135m. +
+ +
+
+
+
+

slice()

+


We can also choose to work with only specific rows in our data table using the slice() function.


+

To select a subset of observations (rows) by their ordinal position, we use the slice() function.

+
+
slice(OTU_metadata_table, 1)
+ +
+

You can list multiple ordinal positions to select multiple observations at once.

+
+
slice(OTU_metadata_table, 1, 2, 3, 4, 5)
+ +
+

If you would like to select a range of observations, give the starting and end position separated by a colon like so: <start>:<end>.

+
+
slice(OTU_metadata_table, 1:5)
+ +
+

Quiz
+
+
+
+
+ +
+

+
+

Exercise: slice() and select()

+

Using slice() and select(), determine:

+
    +
  1. what depth value occurs in the 20th row?
  2. +
  3. what methane value occurs in the 170th row?
  4. +
+
+
dat <- OTU_metadata_table
+ +
+
+
+
+

Summary Exercise

+

The geochemicals dataset is included in the “educer” package. This data frame contains time series observations on the water column chemistry. Learn more about the geochemicals dataset by running the following line in your R console.

+
+
?geochemicals
+ +
+

Using the geochemical data:

+
    +
  1. Select the Cruise, Date, Depth, and oxygen variables.
  2. +
  3. Filter the data to retain data on Cruise 72 where Depth is greater than or equal to 0.05 km.
  4. +
+

Your resulting dat object should be a [16, 4] data frame. The data has been loaded for you into the dat variable.

+
+
dat <- geochemicals
+ +
+
+

Challenge exercise: select() and filter()

+

If you want more practice or have previous experience in R, try this more challenging exercise! Be sure to create a fresh dat.

+
    +
  1. Keep only the Cruise and Depth variables and the rows where oxygen OR nitrate is greater than zero.
  2. +
+

Your resulting dat object should be a [1432, 2] data frame. Hint: Can you filter based on a variable that you previously removed by not selecting it?

+
+
dat <- geochemicals
+ +
+
+
+
+

Additional resources

+
    +
  • R cheatsheets also available in RStudio under Help > Cheatsheets
  • +
  • Introduction to dplyr
  • +
  • dplyr tutorial
  • +
  • dplyr video tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
  • +
+
+ +
+ +
+
+
+
+ + +
+ +

Michelle Kang

+

05/02/2020

+
+ + +
+
+
+
+ + +
+
+ + + + + + + + + + + + + + From 8e98f0845af081f409ccc62a450b6a7c76662912 Mon Sep 17 00:00:00 2001 From: r-karimi Date: Mon, 12 Oct 2020 09:56:13 -0700 Subject: [PATCH 03/11] Edit 'Loading Tabular Data' --- .../data_wrangling_basic_data_description.Rmd | 28 +++++++++++++++++ .../data_wrangling_basic.Rmd | 8 +++-- .../data_wrangling_basic.html | 30 ++++--------------- 3 files changed, 39 insertions(+), 27 deletions(-) create mode 100644 inst/resources/markdown/data_wrangling_basic_data_description.Rmd diff --git a/inst/resources/markdown/data_wrangling_basic_data_description.Rmd b/inst/resources/markdown/data_wrangling_basic_data_description.Rmd new file mode 100644 index 0000000..5d9c413 --- /dev/null +++ b/inst/resources/markdown/data_wrangling_basic_data_description.Rmd @@ -0,0 +1,28 @@ +The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia. + +The data that you will use in R are 16S amplicon profiles of microbial communities at several depths in Saanich Inlet from one time point in this series (August 2012). These ~300 bp sequences were processed using [mothur](https://www.mothur.org/wiki/Main_Page) to yield 97% (approximately species-level) operational taxonomic units (OTUs). + +`combined` is a comma-delimited table of counts of four OTUs in each sample, normalized to 100,000 sequences per sample and the corresponding conditions of each sample (Depth, NO2, NO3 etc). + +For a brief introduction to these data, see Hallam SJ et al. 2017. Monitoring microbial responses to ocean deoxygenation in a model oxygen minimum zone. Sci Data 4: 170158 [doi:10.1038/sdata.2017.158](https://www.nature.com/articles/sdata2017158). 
+ +Click the button below to save a copy of this data set to your computer: + +```{r echo = FALSE} +# Download button shiny app UI +fluidRow( + column(12, align = "center", downloadButton("downloadData", "Download")) +) + +``` + +```{r context = "server"} +# Download button shiny app server +output$downloadData <- downloadHandler( + filename = "combined.csv", + content = function(file) { + write_csv(combined, file) + } +) +``` + diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index c8a5763..02b0b4f 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -66,7 +66,6 @@ library(educer) ``` - ## Tidyverse vs. base R The [tidyverse](https://www.tidyverse.org/) is a collection of R packages for data wrangling, analysis, and visualization. @@ -109,7 +108,10 @@ Before working with our data, we first want to make a copy of the raw data so th We will then continually overwrite this object with `<-` as we clean it in R. + + + ### Data description The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia. @@ -168,7 +170,7 @@ Now our data is formatted nicely into table form. ### Save data in the environment -Since we want to do more with our data after reading it in, we need to save it as a variable in R like we did previously with the `<-` operator. You can choose to name the object whatever you like, though this module assumes the names used below. +Since we want to further manipulate our dataset after reading it in, we need to save it as a variable in R like we did previously with the `<-` operator. You can name the object whatever you like, though this module will assume the names used below. 
```{r eval = FALSE} OTU_metadata_table <- read_csv(file = "combined.csv", col_names = TRUE) @@ -188,7 +190,7 @@ quiz( ## Data exploration -Let's explore the data that we've imported into R. The simplest way to view your imported data is to view it as a "tibble" like this. This view displays a subset of large data tables (notice that the last column name gets cut off) in a table view +Let's explore the data that we've imported into R. The simplest way to view your imported data is to view it as a "tibble" like this. This view displays a subset of large data table. ```{r tibble, exercise = TRUE, exercise.lines = 5} OTU_metadata_table diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html index 32a8e3b..0895832 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -129,27 +129,6 @@

By the end of this tutorial you should be able to:

-
-

R packages

-

R packages are units of shareable code, containing functions that facilitate and enhance analyses. In simpler terms, think of R packages as iPhone Applications. Each App has specific functions and capabilities that can be accessed when we install and then open the application. The same can be said about R packages. In order to use the functions for a specific R package, we first need to install the package, then each time we want to use the package we need to “open” the package.

-

In this tutorial we will be using the “tidyverse” package. This package contains a versatile set of functions designed for easy manipulation of data.

-
-

Installing Packages

-

The tidyverse package can be installed like this (to install a different package just replace “tidyverse” with the name of the desired package):

-

For R v3.4 or newer

-
install.packages("tidyverse")
-

For R v3.3 or older
-Unfortunately, tidyverse does not work on older versions of R. Instead, you will need to individually install each package within the tidyverse.

-
install.packages("tibble")
-install.packages("readr")
-install.packages("dplyr")
-install.packages("tidyr")
-install.packages("ggplot2")
-install.packages("stringr")
-install.packages("purrr")
-install.packages("forcats")
-
-

Loading packages

After installing a package, and every time you open a new RStudio session, the packages you want to use need to be loaded (opened) into the R work-space with the library() function. This tells R to access the package’s functions and prevents the lags that would occur if RStudio automatically loaded every downloaded package every time you opened it.

@@ -195,6 +174,9 @@

Tidyverse vs. base R

Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practices for data science in general.

# dat <- raw_dat

We will then continually overwrite this object with <- as we clean it in R.

+ + +

Data description

The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia.

@@ -559,7 +541,7 @@

Additional resources

-

glimpse() is a function that as its name suggests, allows us to get a “glimpse” of the contents of a data table. Running glimpse() on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let’s run glimpse() with our OTU_metadata_table like this:

+

glimpse() is a function that, as its name suggests, allows us to get a “glimpse” of the contents of a data table. Running glimpse() on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let’s run glimpse() with our OTU_metadata_table like this:

glimpse(OTU_metadata_table)
@@ -275,20 +275,22 @@

select()

Exercise

-

As an exercise, select for only the depth and geochemical columns in OTU_metadata_table and name the new table metadata:

+

As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in OTU_metadata_table and name the new table metadata:

+
+
select(OTU_metadata_table, <variable1>, <variable2>, <...>)
-
-

Exercise: select()

-

Select the Cruise, Date, Depth, PO4, and WS_NO3 variables of the geochemicals data set

-
-
dat <- select()
-
-dat
- +
+
select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4)
+ + + + + +
@@ -303,13 +305,27 @@

Booleans

The equation x < 3 is FALSE because x is set to 6 in the line above. As a simple exercise, manipulate the above code to make both equations TRUE.

-

note that in R, == is used in Boolean equations and using a single = will result in error. As you may have noticed above a single = is used to set a variable to a value.

+
+
#x <- A number less than 3
+#y <- A string
+
+x < 3
+y == "dog"
+
+
+
x <- 1
+y <- "dog"
+
+x < 3
+y == "dog"
+
+

Note that in R, == is used in Boolean equations and using a single = will result in error. As you may have noticed above a single = is used to set a variable to a value.

For quick reference, here are the most commonly used statements and operators.

- - + + @@ -353,6 +369,16 @@

Exercise

+
+
# Consider how we represent the "greater/less than" operator and the "OR" operator in R.
+
+# "x is greater than n" can be represented in the following way.
+x > n
+
+
+
# "x is greater than n or less than m" can be represented in the following way.
+x > n | x < m
+

filter()

@@ -393,7 +419,7 @@

filter()

filter(subset_dat, CTD_O2 <= 120 & CTD_O2 >= 20)
-

Logical OR |. Find the rows where the value is <= 50 OR >= 150.

+

Lastly, we can use the logical OR (|) to find the rows where the value is <= 50 OR >= 150.

filter(subset_dat, CTD_O2 <= 50 | CTD_O2 >= 150)
@@ -541,7 +567,7 @@

Additional resources

- - From 564eee7d55c2f788747cc426c54ba62746f61aaf Mon Sep 17 00:00:00 2001 From: r-karimi <66281159+r-karimi@users.noreply.github.com> Date: Sat, 17 Oct 2020 11:02:44 -0700 Subject: [PATCH 05/11] Rewrote description and learning objectives --- .../data_wrangling_basic.Rmd | 48 ++++++---- .../data_wrangling_basic.html | 92 ++++++++++++------- 2 files changed, 89 insertions(+), 51 deletions(-) diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index 1eb8de9..0dd4851 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -42,31 +42,46 @@ summary_solution_2 <- -## Objectives +## Motivation -### By the end of this tutorial you should be able to: +### Why use R for data processing? -- Install and load R packages. -- Load tabular data using `read_csv()` and save the data to your R environment. -- Use the `filter()`, `slice()` and `select()` methods to conditionally subset your data. +Imagine you are looking at an environmental library of 10,000 plasmids, and you are asked to make arrow plots of only the plasmids that are less than `7500 bp`, originating only from bacteria or archea, and that have shown activity in your screen. It is more likely than not that you are used to using programs like Excel for general data processing. But how would you do this particular task in excel? You would have to apply conditional filters to multiple rows, then find a program online that accepts annotated plasmid maps as input and manually make each plot. +It would be a time-consuming process, and one you would have to repeat every time you had to complete this particular workflow. Thankfully, R provides packages for data management that make the filtering process a breeze, and the results of your filtering can be fed into a package that can sequentially generate the plots you want. 
If you had reason, you could even write an R script that will generate these plots for you in one click! +While the learning curve is somewhat steeper than programs like Excel, R is a highly modular language that allows for implementation of many different workflows (almost anything you can imagine)! And more importantly, you can write *general* scripts that can take predictable raw input from another source and process it automatically into whatever shape or form you would like. +### Structure of Data Wrangling Tutorials +We have developed three "data wrangling" modules for R. "Data wrangling," the process of taking raw data and transforming it into another, possibly more useful form, is a linchpin of R competency. By learning these basic techniques, you will open the door to creating beautiful figures using `ggplot2`, training machine learning models with `caret`, and so on. -## Loading packages +We have split the contents into beginner, intermediate, and advanced modules. -After installing a package, and *everytime* you open a new RStudio session, the packages you want to use need to be loaded (opened) into the R work-space with the `library()` function. This tells R to access the package's functions and prevents RStudio from lags that would occur if it automatically loaded every downloaded package every time you opened it. +#### Beginner -Packages can be loaded like this: +In this tutorial, we will demonstrate how to load data from your disk in from another data format (usually `.csv`) into R, how to manipulate that data table in R by subsetting it and doing some light data processing, and how to write your processed data to your disk. -```{r tidyverse_load, exercise = TRUE, exercise.lines = 5} -library(tidyverse) -library(educer) -``` + +#### Intermediate + +In this tutorial, we will discuss how to join different datasets together using the numerous `*_join()` functions found in R. 
We will also discuss how to turn "wide-format" data into "long-format" data, and vice-versa. + +#### Advanced + +Here, we will introduce the `purrr` package, which allows you to perform more advanced processing on subsets of your dataset in parallel, and how to apply your own custom functions to columns in your data table. -## Tidyverse vs. base R +## Learning Goals + +- Load tabular data using `read_csv()` and save the data to your R environment. +- Introduce the use of logical operators and conditional statements in R for subsetting your data. +- Use the `filter()`, `slice()` and `select()` methods to conditionally subset your data. +- Use the `mutate()` function to create new variables in your dataset, using your existing variables. +- Writing your processed data to your disk. + + +## What is the Tidyverse? The [tidyverse](https://www.tidyverse.org/) is a collection of R packages for data wrangling, analysis, and visualization. @@ -102,8 +117,8 @@ Each verb works similarly: Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practices for data science in general. -```{r} -# dat <- raw_dat +```{r eval = F} +dat <- raw_dat ``` We will then continually overwrite this object with `<-` as we clean it in R. @@ -463,9 +478,6 @@ dat <- geochemicals ```{r slice_exercise-hint-1} # Recall that slice() allows you to find a row # and select() allows you to find a column -``` - -```{r slice_exercise-hint-2} slice(dat, 20) select(dat, depth) ``` diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html index 38561c7..c430348 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -118,29 +118,44 @@
-
-

Objectives

-
-

By the end of this tutorial you should be able to:

+
+

Motivation

+
+

Why use R for data processing?

+

Imagine you are looking at an environmental library of 10,000 plasmids, and you are asked to make arrow plots of only the plasmids that are less than 7500 bp, originating only from bacteria or archaea, and that have shown activity in your screen. It is more likely than not that you are used to using programs like Excel for general data processing. But how would you do this particular task in Excel? You would have to apply conditional filters to multiple rows, then find a program online that accepts annotated plasmid maps as input and manually make each plot.

+

It would be a time-consuming process, and one you would have to repeat every time you had to complete this particular workflow. Thankfully, R provides packages for data management that make the filtering process a breeze, and the results of your filtering can be fed into a package that can sequentially generate the plots you want. If you had reason, you could even write an R script that will generate these plots for you in one click!

+

While the learning curve is somewhat steeper than programs like Excel, R is a highly modular language that allows for implementation of many different workflows (almost anything you can imagine)! And more importantly, you can write general scripts that can take predictable raw input from another source and process it automatically into whatever shape or form you would like.

+
+
+

Structure of Data Wrangling Tutorials

+

We have developed three “data wrangling” modules for R. “Data wrangling,” the process of taking raw data and transforming it into another, possibly more useful form, is a linchpin of R competency. By learning these basic techniques, you will open the door to creating beautiful figures using ggplot2, training machine learning models with caret, and so on.

+

We have split the contents into beginner, intermediate, and advanced modules.

+
+

Beginner

+

In this tutorial, we will demonstrate how to load data from your disk in another data format (usually .csv) into R, how to manipulate that data table in R by subsetting it and doing some light data processing, and how to write your processed data to your disk.

+
+
+

Intermediate

+

In this tutorial, we will discuss how to join different datasets together using the numerous *_join() functions found in R. We will also discuss how to turn “wide-format” data into “long-format” data, and vice-versa.

+
+
+

Advanced

+

Here, we will introduce the purrr package, which allows you to perform more advanced processing on subsets of your dataset in parallel, and how to apply your own custom functions to columns in your data table.

+
+
+
+
+

Learning Goals

    -
  • Install and load R packages.
  • Load tabular data using read_csv() and save the data to your R environment.
  • +
  • Introduce the use of logical operators and conditional statements in R for subsetting your data.
  • Use the filter(), slice() and select() methods to conditionally subset your data.
  • +
  • Use the mutate() function to create new variables in your dataset, using your existing variables.
  • +
  • Write your processed data to your disk.
-
-
-

Loading packages

-

After installing a package, and everytime you open a new RStudio session, the packages you want to use need to be loaded (opened) into the R work-space with the library() function. This tells R to access the package’s functions and prevents RStudio from lags that would occur if it automatically loaded every downloaded package every time you opened it.

-

Packages can be loaded like this:

-
-
library(tidyverse)
-library(educer)
- -
-
-
-

Tidyverse vs. base R

+
+

What is the Tidyverse?

The tidyverse is a collection of R packages for data wrangling, analysis, and visualization.

The main advantages of using the tidyverse to read in data over base R are:

    @@ -172,7 +187,7 @@

    Tidyverse vs. base R

  • Output is another data frame

Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practices for data science in general.

-
# dat <- raw_dat
+
dat <- raw_dat

We will then continually overwrite this object with <- as we clean it in R.

@@ -370,7 +385,8 @@

Exercise

-
# Consider how we represent the "greater/less than" operator and the "OR" operator in R.
+
# Consider how we represent the "greater/less than" 
+# operator and the "OR" operator in R.
 
 # "x is greater than n" can be represented in the following way.
 x > n
@@ -431,6 +447,13 @@

Exercise

+
+
# Recall the general syntax for filtering.
+filter(dataset, (column, operator, quantity))
+
+
+
filter(subset_dat, depth <= 135)
+
@@ -467,9 +490,19 @@

Exercise: slice() and select()

  • what methane value occurs in the 170th row?
  • -
    dat <- OTU_metadata_table
    +
    dat <- geochemicals
    +
    +
    # Recall that slice() allows you to find a row
    +# and select() allows you to find a column
    +slice(dat, 20)
    +select(dat, depth)
    +
    +
    +
    slice(select(dat, Depth), 20)
    +select(slice(dat, 170), methane)
    +
    @@ -549,13 +582,6 @@

    Additional resources

    }) - - From f98c8f22e2d69782a0b1de8ada93148bf50e7921 Mon Sep 17 00:00:00 2001 From: r-karimi <66281159+r-karimi@users.noreply.github.com> Date: Sat, 17 Oct 2020 11:24:00 -0700 Subject: [PATCH 06/11] Restructure data i/o section --- .../data_wrangling_basic.Rmd | 78 ++++++++++++------- .../data_wrangling_basic.html | 26 ++++--- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index 0dd4851..14c8b59 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -118,10 +118,15 @@ Each verb works similarly: Before working with our data, we first want to make a copy of the raw data so that we may revert to it quickly if we make any mistakes. This is best practices for data science in general. ```{r eval = F} -dat <- raw_dat +working_data <- raw_data ``` -We will then continually overwrite this object with `<-` as we clean it in R. +We will then repeatedly overwrite this object with the assignment operator (`<-`) as we further process it in R, as follows. + +```{r eval = F} +working_data <- working_data + 7 +working_data <- working_data / 3 +``` @@ -157,48 +162,67 @@ output$downloadData <- downloadHandler( ) ``` -## Loading tabular data +## Reading and Writing Data to Disk -Every R function follows the following basic syntax, where `function()` is the name of the function and `arguments` are the different parameters you can specify. +### Reading in a Dataset -`function(argument1=..., argument2=..., ...)` +First, ensure that you have downloaded the `combined.csv` file from the previous section, and you have saved it to your working directory. If you saved the file to another location, the data import function below will fail. To check your working directory, you can run the following. 
-Data tables can be loaded into R using the tidyverse `read_*()` function. - -The `read_table()` function allows us to open raw datafiles. Run the following code to see what `combined.csv` looks like. - -```{r eval = FALSE} -read_table("combined.csv") +```{r eval = F} +getwd() ``` -Notice how values in `combined.csv` are separated by commas. We can load our Saanich data into R with `read_csv()` for comma separated files and specify the arguments that describe our data as follows. +We can load our Saanich data into R with `read_csv()` for comma separated files and specify the arguments that describe our data as follows. - `col_names`: tells R that the first row is column names, not data -```{r eval = FALSE} -read_csv(file = "combined.csv", col_names = TRUE) +```{r eval = F} +raw_data <- read_csv(file = "combined.csv", col_names = TRUE) ``` -Now our data is formatted nicely into table form. + -### Save data in the environment +```{r include=FALSE} +raw_data <- combined +``` -Since we want to further manipulate our dataset after reading it in, we need to save it as a variable in R like we did previously with the `<-` operator. You can name the object whatever you like, though this module will assume the names used below. +Now our data is formatted nicely into table form, and we can have a look at it with the `head()` function. 
-```{r eval = FALSE} -OTU_metadata_table <- read_csv(file = "combined.csv", col_names = TRUE) +```{r} +head(raw_data) ``` -```{r saving-quiz, echo = FALSE} -quiz( - question("How do we make sure the datatable is saved to 'OTU_metadata_table' in our environment?", - answer("Entering `\"OTU_metadata_table\"` (with quotes) in the console displays the table."), - answer("Entering `OTU_metadata_table` (no quotes) in the console displays the table.", correct = TRUE), - answer("`OTU_metadata_table` shows up in the \"Global Environment\" box on the top right hand corner.", correct = TRUE) - ) -) +The `head()` function prints the first six rows of your dataset, alongside your column names. The `` printed below column names like `Cruise` is the data type of the column. In this case, it's a ``, which is short for [https://en.wikipedia.org/wiki/Double-precision_floating-point_format](double-precision floating-point format), a particular way of holding numbers in memory. The details are beyond the scope of this tutorial, but just keep in mind it's a different data type than, say, a character string (``). Anyway, we see that our import was successful. + + + + + + + + + + + + + + + + + + + +### Writing Data to Disk + +Although we have done no processing to our dataset, let's assume that we have, and that we want to save a processed dataset to disk. First, let's make a dummy processed dataset. + +```{r} +processed_data <- raw_data ``` +Then, to save it to disk, we have to use the `write_csv()` command. + + ## Data exploration Let's explore the data that we've imported into R. The simplest way to view your imported data is to view it as a "tibble" like this. This view displays a subset of large data table. 
diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html index c430348..3edc95e 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -211,17 +211,21 @@

    Data description

    Loading tabular data

    -

    Every R function follows the following basic syntax, where function() is the name of the function and arguments are the different parameters you can specify.

    -

    function(argument1=..., argument2=..., ...)

    -

    Data tables can be loaded into R using the tidyverse read_*() function.

    -

    The read_table() function allows us to open raw datafiles. Run the following code to see what combined.csv looks like.

    -
    read_table("combined.csv")
    -

    Notice how values in combined.csv are separated by commas. We can load our Saanich data into R with read_csv() for comma separated files and specify the arguments that describe our data as follows.

    +

    First, ensure that you have downloaded the combined.csv file from the previous section, and you have saved it to your working directory. If you saved the file to another location, the data import function below will fail. To check your working directory, you can run the following.

    +
    getwd()
    +

    We can load our Saanich data into R with read_csv() for comma separated files and specify the arguments that describe our data as follows.

    • col_names: tells R that the first row is column names, not data
    -
    read_csv(file = "combined.csv", col_names = TRUE)
    -

    Now our data is formatted nicely into table form.

    +
    raw_data <- read_csv(file = "combined.csv", col_names = TRUE)
    + +

    Now our data is formatted nicely into table form, and we can have a look at it with the head() function.

    +
    head(raw_data)
    +
    + +

    Save data in the environment

    Since we want to further manipulate our dataset after reading it in, we need to save it as a variable in R like we did previously with the <- operator. You can name the object whatever you like, though this module will assume the names used below.

    @@ -593,7 +597,7 @@

    Additional resources

    -
    -

    Save data in the environment

    -

    Since we want to further manipulate our dataset after reading it in, we need to save it as a variable in R like we did previously with the <- operator. You can name the object whatever you like, though this module will assume the names used below.

    -
    OTU_metadata_table <- read_csv(file = "combined.csv", col_names = TRUE)
    -

    Quiz
    -
    -
    -
    -
    - +

    The head() function prints the first six rows of your dataset, alongside your column names. The <dbl> printed below column names like Cruise is the data type of the column. In this case, it’s a <dbl>, which is short for https://en.wikipedia.org/wiki/Double-precision_floating-point_format, a particular way of holding numbers in memory. The details are beyond the scope of this tutorial, but just keep in mind it’s a different data type than, say, a character string (<chr>). Anyway, we see that our import was successful.

    + + + + + + + + + + + + + +
    -

    +
    +

    Writing Data to Disk

    +

    Although we have done no processing to our dataset, let’s assume that we have, and that we want to save a processed dataset to disk. First, let’s make a dummy processed dataset.

    +
    processed_data <- raw_data
    +

    Then, to save it to disk, we have to use the write_csv() command.

    @@ -596,10 +609,6 @@

    Additional resources

    ) - - From e232dd3443b82a58705cf6f9146ac82d41021a35 Mon Sep 17 00:00:00 2001 From: r-karimi Date: Sat, 24 Oct 2020 11:16:31 -0700 Subject: [PATCH 08/11] Rework logical operators section --- .../data_wrangling_basic.Rmd | 208 +++++++++------ .../data_wrangling_basic.html | 249 +++++++++--------- 2 files changed, 250 insertions(+), 207 deletions(-) diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index 22ebfd6..37329ee 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -165,7 +165,7 @@ output$downloadData <- downloadHandler( ) ``` -## Reading and Writing Data to Disk +## Reading and Writing Data ### Reading in a Dataset @@ -195,57 +195,58 @@ Now our data is formatted nicely into table form, and we can have a look at it w head(raw_data) ``` -The `head()` function prints the first six rows of your dataset, alongside your column names. The `` printed below column names like `Cruise` is the data type of the column. In this case, it's a ``, which is short for [https://en.wikipedia.org/wiki/Double-precision_floating-point_format](double-precision floating-point format), a particular way of holding numbers in memory. The details are beyond the scope of this tutorial, but just keep in mind it's a different data type than, say, a character string (``). Anyway, we see that our import was successful. +The `head()` function prints the first six rows of your dataset, alongside your column names. The `` printed below column names like `Cruise` is the data type of the column. In this case, it's a ``, which is short for [double-precision floating-point format](https://en.wikipedia.org/wiki/Double-precision_floating-point_format), a particular way of holding numbers in memory. 
The details are beyond the scope of this tutorial, but just keep in mind it's a different data type than, say, a character string (``). Anyway, we see that our import was successful. - +As an exercise, go ahead and place a copy of `combined.csv` in a folder called `import_exercise` in your working directory. Then, try to read in `combined.csv` directly from the folder, and use `head()` to ensure the data import was successful and nothing looks funny. - +```{r folder-import, exercise = TRUE, exercise.lines = 2} - - - - - - - - - - - - +``` + +```{r folder-import-hint-1} +# How do you specify that a file is inside a folder in a filepath? +# If a folder is called "foo", then you would include "foo/" in your filepath. +``` + +```{r folder-import-hint-2} +raw_data <- read_csv(file = "import_exercise/combined.csv", col_names = TRUE) +head(raw_data) +``` ### Writing Data to Disk -Although we have done no processing to our dataset, let's assume that we have, and that we want to save a processed dataset to disk. First, let's make a dummy processed dataset. +Although we have done no processing to our dataset, let's assume that we have. Now we want to save a processed dataset to disk. First, let's make a dummy processed dataset that we can practice saving to disk. ```{r} processed_data <- raw_data ``` -Then, to save it to disk, we have to use the `write_csv()` command. +Then, to save it to disk, we have to use the `write_csv()` function. The `write_csv()` function takes two critical arguments, `x` and `path`. `x` is simply the tibble you wish to save to memory, and `path` is a character string indicating the filepath of the new object. The following command will save `processed_data` as `processed.csv` in your current working directory. There are other optional arguments that can tweak how your data is saved, which you can read about by running `?write_csv`. 
+```{r eval = F} +write_csv(processed_data, path = "processed.csv", col_names = TRUE) +``` -## Data exploration +### Data exploration -Let's explore the data that we've imported into R. The simplest way to view your imported data is to view it as a "tibble" like this. This view displays a subset of large data table. +Let's explore the data that we've imported into R. Although we've discussed the use of `head()`, the simplest way to view your imported data is to call the variable name directly. This view displays a subset of large data table. ```{r tibble, exercise = TRUE, exercise.lines = 5} -OTU_metadata_table +raw_data ``` -`glimpse()` is a function that, as its name suggests, allows us to get a "glimpse" of the contents of a data table. Running `glimpse()` on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let's run `glimpse()` with our OTU_metadata_table like this: +`glimpse()` is a function that allows us to get a "glimpse" of the contents of a data table. Running `glimpse()` on a data table outputs its number of rows (observations); its number of columns (variables); a list of each column name, along with its type and a portion of its contents. We can run `glimpse()` on `raw_data` like so: -```{r glimpse, exercise = TRUE, exercise.lines = 5} -glimpse(OTU_metadata_table) +```{r} +glimpse(raw_data) ``` -from this we see that our table has 7 rows and 10 columns. Each $ is followed by a column name, with information on the contents following each column name. `glimpse()` lists all columns of a table -### Exercise +From the output above, we see that our table has 7 rows and 10 columns. As discussed above, each $ is followed by a column name, a portion of the data it contains, and its data type. 
```{r glimpse-quiz, echo = FALSE} quiz( - question("Which columns are in the OTU_metadata_table?", + question("Which columns are in raw_data?", answer("OTU001", correct = TRUE), answer("Otu002"), answer("72"), @@ -257,108 +258,100 @@ quiz( ) ``` -If we only want the dimensions of a dataframe or table, we can use the `dim()` function which prints the number of rows followed by the number of columns. Simple functions to query just the number of rows or columns in a data table are `nrow()` and `ncol()`. +If we want to know the dimensions of our tibble, we can use the `dim()` function, which prints the number of rows followed by the number of columns. -```{r dim, exercise = TRUE, exercise.lines = 5} -#number of rows followed by number of columns -dim(OTU_metadata_table) -#number of rows -nrow(OTU_metadata_table) -#number of columns -ncol(OTU_metadata_table) -``` - -We can list the column names using `colnames()`. - -```{R colnames,exercise = TRUE, exercise.lines = 5} -colnames(OTU_metadata_table) +```{r} +dim(raw_data) ``` +Simple functions to obtain only the number of rows or only the number of columns in a data table are `nrow()` and `ncol()`. -## `select()` - -You can use the `select()` function to keep only a subset of variables (columns). Let's select the variables `OTU0001`, `OTU0002`, `OTU0004`, `Depth`. - -```{R select-1, exercise = TRUE, exercise.lines = 5} -restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth) +```{r} +nrow(raw_data) +ncol(raw_data) ``` -To view our new `restricted_columns` variable, just type in the variable name and run the code like this: +Lastly, we can list the column names using `colnames()`. 
-```{R select-2, exercise = TRUE, exercise.lines = 5} -restricted_columns +```{r} +colnames(raw_data) ``` -### Exercise -As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in `OTU_metadata_table` and name the new table `metadata`: +## Logical Operators in R -```{r select-exercise, exercise = TRUE, exercise.lines = 5} +Logical operators are special symbols in R that one can use to ask `TRUE/FALSE` questions about your data. For instance, say you had a column in a data frame containing the following data: `c("apple", "pear", "banana")`. You could then ask R, "which entries in my column are equivalent to the character string `"pear"`? R would then return the following vector: `c(FALSE, TRUE, FALSE)`. As you might imagine, the next step could be to tell R to only keep the entries for which the answer to this question is `TRUE` or only keep those that are `FALSE`. -``` +The `==` operator asks R whether the left-hand side is equivalent to the right-hand side. So here is how you would ask the above question in R. -```{r select-exercise-hint-1} -select(OTU_metadata_table, , , <...>) -``` +```{r} +# create our fruit vector +fruits = c("apple", "pear", "banana") -```{r select-exercise-hint-2} -select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4) +# ask our question +fruits == "pear" ``` - - - - +We see that our predicted vector is returned. - +This process also works with variables containing single pieces of data too. For instance, let's initialize the following variables. - - +```{r} +number <- 6 +animal <- "cat" +``` +And ask some questions. Is `number` less than 3? +```{r} +number < 3 +``` +Is the `animal` a dog? -## Booleans +```{r} +animal == "dog" +``` -Booleans are logical statements that are either `TRUE` or `FALSE` but can not be anything in between. As an example, run the code below: +As a simple exercise, manipulate the below code to make both equations `TRUE`. 
```{r boolean-exercise, exercise = TRUE, exercise.lines = 5} -x <- 6 -y <- "cat" +number <- 6 +animal <- "cat" -x < 3 -y == "dog" +number < 3 +animal == "dog" ``` -The equation `x < 3` is `FALSE` because x is set to 6 in the line above. As a simple exercise, manipulate the above code to make both equations `TRUE`. - ```{r boolean-exercise-hint-1} -#x <- A number less than 3 -#y <- A string +number <- #A number less than 3 +animal <- #A string -x < 3 -y == "dog" +number < 3 +animal == "dog" ``` ```{r boolean-exercise-hint-2} -x <- 1 -y <- "dog" +number <- 1 +animal <- "dog" -x < 3 -y == "dog" +number < 3 +animal == "dog" ``` -Note that in R, `==` is used in Boolean equations and using a single `=` will result in error. As you may have noticed above a single `=` is used to set a variable to a value. +Note that in R, only the double `==` is a logical operator. Using a single `=` will result in error. As you may have noticed, a single `=` can only be used to assign a variable to a value. + +We can chain together multiple logical operators in a single question. Say we wanted to ask, "is `number` less than 10 but also greater than 4?" We would do so with the following operator: `number < 10 & number > 4`. For quick reference, here are the most commonly used statements and operators. R Operator | Meaning ---------- | --------------- -`==` | equals +`==` | equivalent to +`!=` | not equivalent to `< or >` | less/greater than `<= or >= `| less/greater than or equal to `%in%` | in `is.na` | is missing (`NA`) -`!` | not (as in not equal to `!=`) `&` | and `|` | or @@ -385,6 +378,51 @@ x > n | x < m ``` + + +## `select()` + +You can use the `select()` function to keep only a subset of variables (columns). Let's select the variables `OTU0001`, `OTU0002`, `OTU0004`, `Depth`. 
+ +```{R select-1, exercise = TRUE, exercise.lines = 5} +restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth) +``` + +To view our new `restricted_columns` variable, just type in the variable name and run the code like this: + +```{R select-2, exercise = TRUE, exercise.lines = 5} +restricted_columns +``` + +### Exercise +As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in `OTU_metadata_table` and name the new table `metadata`: + +```{r select-exercise, exercise = TRUE, exercise.lines = 5} + +``` + +```{r select-exercise-hint-1} +select(OTU_metadata_table, , , <...>) +``` + +```{r select-exercise-hint-2} +select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4) +``` + + + + + + + + + + + + + + + ### `filter()` Conditional statements and logical operators are important when working with data in R. We will practice using different conditional statements and logical operators on the oxygen data in a subset of the `geochemicals` data set. You can use `filter()` to select specific rows based on a logical condition of a variable. diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html index 9498a8f..5f7a3dd 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -162,14 +162,14 @@

    What is the Tidyverse?

  • Faster data processing
  • Seamless integration with other tidyverse functions
  • Automatic designation of data types
  • -
  • Data storage in tibble as opposed to data frames +
  • Data storage in tibbles as opposed to data frames
    • Tibbles are data frames with an additional layer of formatting that causes them to print nicely in the console and always return a tibble in functions
  • A popular package for data wrangling is dplyr in the tidyverse. This package is so good at what it does, and integrates so well with other popular tools like ggplot2, that it has rapidly become the de-facto standard.

    dplyr code is very readable because all operations are based on using dplyr functions or verbs (select, filter, mutate…).

    -

    Typical data wrangling tasks in dplyr:

    +

    Typical verbs in dplyr:

    • select() a subset of variables (columns)
    • slice() out rows by their ordinal position in the tbl
    • @@ -196,6 +196,7 @@

      What is the Tidyverse?

      Data description

      +

      The data used throughout this module were collected as part of an on-going oceanographic time series program in Saanich Inlet, a seasonally anoxic fjord on the East coast of Vancouver Island, British Columbia.

      The data that you will use in R are 16S amplicon profiles of microbial communities at several depths in Saanich Inlet from one time point in this series (August 2012). These ~300 bp sequences were processed using mothur to yield 97% (approximately species-level) operational taxonomic units (OTUs).

      combined is a comma-delimited table of counts of four OTUs in each sample, normalized to 100,000 sequences per sample and the corresponding conditions of each sample (Depth, NO2, NO3 etc).

      @@ -211,8 +212,8 @@

      Data description

    -
    -

    Reading and Writing Data to Disk

    +
    +

    Reading and Writing Data

    Reading in a Dataset

    First, ensure that you have downloaded the combined.csv file from the previous section, and you have saved it to your working directory. If you saved the file to another location, the data import function below will fail. To check your working directory, you can run the following.

    @@ -230,44 +231,49 @@

    Reading in a Dataset

    {"columns":[{"label":["Cruise"],"name":[1],"type":["dbl"],"align":["right"]},{"label":["Depth"],"name":[2],"type":["dbl"],"align":["right"]},{"label":["OTU0001"],"name":[3],"type":["dbl"],"align":["right"]},{"label":["OTU0002"],"name":[4],"type":["dbl"],"align":["right"]},{"label":["OTU0003"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["OTU0004"],"name":[6],"type":["dbl"],"align":["right"]},{"label":["NO3"],"name":[7],"type":["dbl"],"align":["right"]},{"label":["Mean_NO2"],"name":[8],"type":["dbl"],"align":["right"]},{"label":["Mean_N2O"],"name":[9],"type":["dbl"],"align":["right"]},{"label":["Mean_NH4"],"name":[10],"type":["dbl"],"align":["right"]}],"data":[{"1":"72","2":"10","3":"263","4":"0","5":"3210","6":"26","7":"1.793","8":"0.1275","9":"0.849","10":"0.4080"},{"1":"72","2":"100","3":"6489","4":"0","5":"18405","6":"779","7":"26.400","8":"0.0817","9":"18.087","10":"0.1344"},{"1":"72","2":"120","3":"24380","4":"0","5":"8221","6":"3404","7":"21.302","8":"0.0978","9":"16.304","10":"0.1782"},{"1":"72","2":"135","3":"39519","4":"3","5":"2793","6":"5368","7":"15.917","8":"0.0706","9":"12.909","10":"0.1296"},{"1":"72","2":"150","3":"55812","4":"12","5":"596","6":"7032","7":"5.278","8":"0.1127","9":"11.815","10":"2.1754"},{"1":"72","2":"165","3":"49362","4":"6","5":"178","6":"10689","7":"0.000","8":"0.0805","9":"6.310","10":"4.7095"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
    -

    The head() function prints the first six rows of your dataset, alongside your column names. The <dbl> printed below column names like Cruise is the data type of the column. In this case, it’s a <dbl>, which is short for https://en.wikipedia.org/wiki/Double-precision_floating-point_format, a particular way of holding numbers in memory. The details are beyond the scope of this tutorial, but just keep in mind it’s a different data type than, say, a character string (<chr>). Anyway, we see that our import was successful.

    - - - - - - - - - - - - - - +

    The head() function prints the first six rows of your dataset, alongside your column names. The <dbl> printed below column names like Cruise is the data type of the column. In this case, it’s a <dbl>, which is short for double-precision floating-point format, a particular way of holding numbers in memory. The details are beyond the scope of this tutorial, but just keep in mind it’s a different data type than, say, a character string (<chr>). Anyway, we see that our import was successful.

    +

    As an exercise, go ahead and place a copy of combined.csv in a folder called import_exercise in your working directory. Then, try to read in combined.csv directly from the folder, and use head() to ensure the data import was successful and nothing looks funny.

    +
    + +
    +
    +
    # How do you specify that a file is inside a folder in a filepath?
    +# If a folder is called "foo", then you would include "foo/" in your filepath.
    +
    +
    +
    raw_data <- read_csv(file = "import_exercise/combined.csv", col_names = TRUE)
    +head(raw_data)
    +

    Writing Data to Disk

    -

    Although we have done no processing to our dataset, let’s assume that we have, and that we want to save a processed dataset to disk. First, let’s make a dummy processed dataset.

    +

    Although we have done no processing to our dataset, let’s assume that we have. Now we want to save a processed dataset to disk. First, let’s make a dummy processed dataset that we can practice saving to disk.

    processed_data <- raw_data
    -

    Then, to save it to disk, we have to use the write_csv() command.

    -
    +

Then, to save it to disk, we have to use the write_csv() function. The write_csv() function takes two critical arguments, x and path. x is simply the tibble you wish to save to disk, and path is a character string indicating the filepath of the new object. The following command will save processed_data as processed.csv in your current working directory. There are other optional arguments that can tweak how your data is saved, which you can read about by running ?write_csv.

    +
    write_csv(processed_data, path = "processed.csv", col_names = TRUE)
    -
    -

    Data exploration

    -

    Let’s explore the data that we’ve imported into R. The simplest way to view your imported data is to view it as a “tibble” like this. This view displays a subset of large data table.

    +
    +

    Data exploration

    +

Let’s explore the data that we’ve imported into R. Although we’ve discussed the use of head(), the simplest way to view your imported data is to call the variable name directly. This view displays a subset of a large data table.

    -
    OTU_metadata_table
    +
    raw_data
    -

    glimpse() is a function that, as its name suggests, allows us to get a “glimpse” of the contents of a data table. Running glimpse() on a data table outputs the number of rows (observations), columns (variables), and lists each column name along with its type and a portion of its contents. Let’s run glimpse() with our OTU_metadata_table like this:

    -
    -
    glimpse(OTU_metadata_table)
    - -
    -

    from this we see that our table has 7 rows and 10 columns. Each $ is followed by a column name, with information on the contents following each column name. glimpse() lists all columns of a table

    -
    -

    Exercise

    +

    glimpse() is a function that allows us to get a “glimpse” of the contents of a data table. Running glimpse() on a data table outputs its number of rows (observations); its number of columns (variables); a list of each column name, along with its type and a portion of its contents. We can run glimpse() on raw_data like so:

    +
    glimpse(raw_data)
    +
    ## Rows: 7
    +## Columns: 10
    +## $ Cruise   <dbl> 72, 72, 72, 72, 72, 72, 72
    +## $ Depth    <dbl> 10, 100, 120, 135, 150, 165, 200
    +## $ OTU0001  <dbl> 263, 6489, 24380, 39519, 55812, 49362, 8140
    +## $ OTU0002  <dbl> 0, 0, 0, 3, 12, 6, 41438
    +## $ OTU0003  <dbl> 3210, 18405, 8221, 2793, 596, 178, 60
    +## $ OTU0004  <dbl> 26, 779, 3404, 5368, 7032, 10689, 273
    +## $ NO3      <dbl> 1.793, 26.400, 21.302, 15.917, 5.278, 0.000, 0.000
    +## $ Mean_NO2 <dbl> 0.1275, 0.0817, 0.0978, 0.0706, 0.1127, 0.0805, 0.0000
    +## $ Mean_N2O <dbl> 0.849, 18.087, 16.304, 12.909, 11.815, 6.310, 0.000
    +## $ Mean_NH4 <dbl> 0.4080, 0.1344, 0.1782, 0.1296, 2.1754, 4.7095, 7.3582
    +

    From the output above, we see that our table has 7 rows and 10 columns. As discussed above, each $ is followed by a column name, a portion of the data it contains, and its data type.

    Quiz
    @@ -276,58 +282,40 @@

    Exercise

    -

    If we only want the dimensions of a dataframe or table, we can use the dim() function which prints the number of rows followed by the number of columns. Simple functions to query just the number of rows or columns in a data table are nrow() and ncol().

    -
    -
    #number of rows followed by number of columns
    -dim(OTU_metadata_table)
    -#number of rows
    -nrow(OTU_metadata_table)
    -#number of columns
    -ncol(OTU_metadata_table)
    - -
    -

    We can list the column names using colnames().

    -
    -
    colnames(OTU_metadata_table)
    - -
    -
    -
    -
    -

    select()

    -

    You can use the select() function to keep only a subset of variables (columns). Let’s select the variables OTU0001, OTU0002, OTU0004, Depth.

    -
    -
    restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth)
    - -
    -

    To view our new restricted_columns variable, just type in the variable name and run the code like this:

    -
    -
    restricted_columns
    - -
    -
    -

    Exercise

    -

    As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in OTU_metadata_table and name the new table metadata:

    -
    - -
    -
    -
    select(OTU_metadata_table, <variable1>, <variable2>, <...>)
    -
    -
    -
    select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4)
    -
    - - - - - - -
    -
    -
    -

    Booleans

    -

    Booleans are logical statements that are either TRUE or FALSE but can not be anything in between. As an example, run the code below:

    +

    If we want to know the dimensions of our tibble, we can use the dim() function, which prints the number of rows followed by the number of columns.

    +
    dim(raw_data)
    +
    ## [1]  7 10
    +

    Simple functions to obtain only the number of rows or only the number of columns in a data table are nrow() and ncol().

    +
    nrow(raw_data)
    +
    ## [1] 7
    +
    ncol(raw_data)
    +
    ## [1] 10
    +

    Lastly, we can list the column names using colnames().

    +
    colnames(raw_data)
    +
    ##  [1] "Cruise"   "Depth"    "OTU0001"  "OTU0002"  "OTU0003"  "OTU0004" 
    +##  [7] "NO3"      "Mean_NO2" "Mean_N2O" "Mean_NH4"
    +
    +
    +
    +

    Logical Operators in R

    +

Logical operators are special symbols in R that one can use to ask TRUE/FALSE questions about your data. For instance, say you had a column in a data frame containing the following data: c("apple", "pear", "banana"). You could then ask R, "which entries in my column are equivalent to the character string 'pear'?" R would then return the following vector: c(FALSE, TRUE, FALSE). As you might imagine, the next step could be to tell R to only keep the entries for which the answer to this question is TRUE or only keep those that are FALSE.

    +

    The == operator asks R whether the left-hand side is equivalent to the right-hand side. So here is how you would ask the above question in R.

    +
    # create our fruit vector
    +fruits = c("apple", "pear", "banana")
    +
    +# ask our question
    +fruits == "pear"
    +
    ## [1] FALSE  TRUE FALSE
    +

    We see that our predicted vector is returned.

    +

    This process also works with variables containing single pieces of data too. For instance,

    +
    x <- 6
    +y <- "cat"
    +
    +x < 3
    +
    ## [1] FALSE
    +
    y == "dog"
    +
    ## [1] FALSE
    +

    The equation x < 3 is FALSE because x is set to 6 in the line above. As a simple exercise, manipulate the below code to make both equations TRUE.

    x <- 6
     y <- "cat"
    @@ -336,10 +324,9 @@ 

    Booleans

    y == "dog"
    -

    The equation x < 3 is FALSE because x is set to 6 in the line above. As a simple exercise, manipulate the above code to make both equations TRUE.

    -
    #x <- A number less than 3
    -#y <- A string
    +
    x <- #A number less than 3
    +y <- #A string
     
     x < 3
     y == "dog"
    @@ -351,7 +338,7 @@

    Booleans

    x < 3 y == "dog"
    -

    Note that in R, == is used in Boolean equations and using a single = will result in error. As you may have noticed above a single = is used to set a variable to a value.

    +

Note that in R, a double == is a logical operator and using a single = will result in an error. As you may have noticed, a single = can only be used to assign a variable to a value.

    For quick reference, here are the most commonly used statements and operators.

    R codemeaningR OperatorMeaning
    @@ -395,7 +382,7 @@

    Booleans

    -
    +

    Exercise

    Write a boolean equation for “x is greater than 6 or less than 12”, it should return TRUE after running.

    @@ -413,6 +400,38 @@

    Exercise

    x > n | x < m
    +
    +
    +

    select()

    +

    You can use the select() function to keep only a subset of variables (columns). Let’s select the variables OTU0001, OTU0002, OTU0004, Depth.

    +
    +
    restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth)
    + +
    +

    To view our new restricted_columns variable, just type in the variable name and run the code like this:

    +
    +
    restricted_columns
    + +
    +
    +

    Exercise

    +

    As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in OTU_metadata_table and name the new table metadata:

    +
    + +
    +
    +
    select(OTU_metadata_table, <variable1>, <variable2>, <...>)
    +
    +
    +
    select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4)
    +
    + + + + + + +

    filter()

    Conditional statements and logical operators are important when working with data in R. We will practice using different conditional statements and logical operators on the oxygen data in a subset of the geochemicals data set. You can use filter() to select specific rows based on a logical condition of a variable.

    @@ -458,7 +477,7 @@

    filter()

    -
    +

    Exercise

    As an exercise, restrict for rows where the value for “depth” is less than or equal to 135m.
    @@ -610,34 +629,34 @@

    Additional resources

    @@ -662,20 +681,6 @@

    Additional resources

    }) - - - - From e9a0c818e27134940e1ca37a9328af374fd1a852 Mon Sep 17 00:00:00 2001 From: r-karimi Date: Sat, 24 Oct 2020 11:38:41 -0700 Subject: [PATCH 09/11] Restructure select, slice, and filter --- .../data_wrangling_basic.Rmd | 116 ++++----- .../data_wrangling_basic.html | 243 +++++++++--------- 2 files changed, 181 insertions(+), 178 deletions(-) diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd index 37329ee..e74f27e 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.Rmd @@ -75,10 +75,11 @@ Here, we will introduce the `purrr` package, which allows you to perform more ad ## Learning Goals - Load tabular data using `read_csv()` and save the data to your R environment. +- Writing your processed data to your disk. - Introduce the use of logical operators and conditional statements in R for subsetting your data. -- Use the `filter()`, `slice()` and `select()` methods to conditionally subset your data. +- Use the `select()`, `slice()`, and `filter()` functions to conditionally subset your data. - Use the `mutate()` function to create new variables in your dataset, using your existing variables. -- Writing your processed data to your disk. +- Use the pipe operator to more efficiently daisy-chain functions together. ## What is the Tidyverse? @@ -378,9 +379,11 @@ x > n | x < m ``` +## `select()`, `slice()`, and `filter()` +Now that we know how to ask logical questions in R, we can take advantage of this to subset our data in any way we'd like. In a nutshell, the `select()` function allows you to select certain *columns* of your data frame to work with, while the `slice()` function allows you to select certain rows. You can isolate specific entries with a combination of `slice()` and `select()`. 
`filter()` allows you to apply a conditional statement to the rows of your table, using the logical operators we talked about in the previous section. -## `select()` +### `select()` You can use the `select()` function to keep only a subset of variables (columns). Let's select the variables `OTU0001`, `OTU0002`, `OTU0004`, `Depth`. @@ -408,19 +411,61 @@ select(OTU_metadata_table, , , <...>) ```{r select-exercise-hint-2} select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4) ``` - - +### `slice()` - +We can also only choose to work with specific rows in our data table using the `slice()` function. - +To select a subset of observations (rows) by their ordinal position, we use the `slice()` function. - - +```{r slice-1, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1) +``` + +You can list multiple ordinal postions to select multiple observations at once. + +```{r slice-2, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1, 2, 3, 4, 5) +``` + +If you would like to to select a range of observations, give the starting and end position separated by a colon like so: `:`. + +```{r slice-3, exercise = TRUE, exercise.lines = 5} +slice(OTU_metadata_table, 1:5) +``` + +```{r slice-quiz, echo = FALSE} +quiz( + question("What is the value of OTU0003 in the 6th row of OTU_metadata_table?", + answer("0"), + answer("156"), + answer("178", correct=TRUE), + answer("72") + ) +) +``` +### Exercise: `slice()` and `select()` +Using `slice()` and `select()`, determine: +A) what depth value occurs in the 20th row? +B) what methane value occurs in the 170th row? 
+ +```{r slice_exercise, exercise=TRUE, exercise.lines=5} +dat <- geochemicals +``` + +```{r slice_exercise-hint-1} +# Recall that slice() allows you to find a row +# and select() allows you to find a column +slice(dat, 20) +select(dat, depth) +``` +```{r slice_exercise-hint-2} +slice(select(dat, Depth), 20) +select(slice(dat, 170), methane) +``` ### `filter()` @@ -497,60 +542,11 @@ filter(dataset, (column, operator, quantity)) filter(subset_dat, depth <= 135) ``` -## `slice()` +## `mutate()` -We can also only choose to work with specific rows in our data table using the `slice()` function. - -To select a subset of observations (rows) by their ordinal position, we use the `slice()` function. - -```{r slice-1, exercise = TRUE, exercise.lines = 5} -slice(OTU_metadata_table, 1) -``` - -You can list multiple ordinal postions to select multiple observations at once. - -```{r slice-2, exercise = TRUE, exercise.lines = 5} -slice(OTU_metadata_table, 1, 2, 3, 4, 5) -``` - -If you would like to to select a range of observations, give the starting and end position separated by a colon like so: `:`. - -```{r slice-3, exercise = TRUE, exercise.lines = 5} -slice(OTU_metadata_table, 1:5) -``` - -```{r slice-quiz, echo = FALSE} -quiz( - question("What is the value of OTU0003 in the 6th row of OTU_metadata_table?", - answer("0"), - answer("156"), - answer("178", correct=TRUE), - answer("72") - ) -) -``` +## The Pipe Operator (`%>%`) -### Exercise: `slice()` and `select()` -Using `slice()` and `select()`, determine: -A) what depth value occurs in the 20th row? -B) what methane value occurs in the 170th row? 
- -```{r slice_exercise, exercise=TRUE, exercise.lines=5} -dat <- geochemicals -``` - -```{r slice_exercise-hint-1} -# Recall that slice() allows you to find a row -# and select() allows you to find a column -slice(dat, 20) -select(dat, depth) -``` - -```{r slice_exercise-hint-2} -slice(select(dat, Depth), 20) -select(slice(dat, 170), methane) -``` ## Summary Exercise The `geochemicals` dataset is included in the "educer" package. This dataframe contains time series observations on the water column chemistry. Learn more about the `geochemicals` dataset by running the following line in your R console. diff --git a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html index 5f7a3dd..a16fff0 100644 --- a/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html +++ b/inst/tutorials/data_wrangling_basic/data_wrangling_basic.html @@ -148,10 +148,11 @@

    Advanced

    Learning Goals

    • Load tabular data using read_csv() and save the data to your R environment.
    • +
    • Writing your processed data to your disk.
    • Introduce the use of logical operators and conditional statements in R for subsetting your data.
    • -
    • Use the filter(), slice() and select() methods to conditionally subset your data.
    • +
    • Use the select(), slice(), and filter() functions to conditionally subset your data.
    • Use the mutate() function to create new variables in your dataset, using your existing variables.
    • -
    • Writing your processed data to your disk.
    • +
    • Use the pipe operator to more efficiently daisy-chain functions together.
    @@ -307,38 +308,40 @@

    Logical Operators in R

    fruits == "pear"
    ## [1] FALSE  TRUE FALSE

    We see that our predicted vector is returned.

    -

    This process also works with variables containing single pieces of data too. For instance,

    -
    x <- 6
    -y <- "cat"
    -
    -x < 3
    +

    This process also works with variables containing single pieces of data too. For instance, let’s initialize the following variables.

    +
    number <- 6
    +animal <- "cat"
    +

    And ask some questions. Is number less than 3?

    +
    number < 3
    ## [1] FALSE
    -
    y == "dog"
    +

    Is the animal a dog?

    +
    animal == "dog"
    ## [1] FALSE
    -

    The equation x < 3 is FALSE because x is set to 6 in the line above. As a simple exercise, manipulate the below code to make both equations TRUE.

    +

    As a simple exercise, manipulate the below code to make both equations TRUE.

    -
    x <- 6
    -y <- "cat"
    +
    number <- 6
    +animal <- "cat"
     
    -x < 3
    -y == "dog"
    +number < 3 +animal == "dog"
    -
    x <- #A number less than 3
    -y <- #A string
    +
    number <- #A number less than 3
    +animal <- #A string
     
    -x < 3
    -y == "dog"
    +number < 3 +animal == "dog"
    -
    x <- 1
    -y <- "dog"
    +
    number <- 1
    +animal <- "dog"
     
    -x < 3
    -y == "dog"
    +number < 3 +animal == "dog"
    -

    Note that in R, a double == is a logical operator and using a single = will result in error. As you may have noticed, a single = can be used to assign a variable to a value.

    +

    Note that in R, only the double == is a logical operator. Using a single = will result in an error. As you may have noticed, a single = can only be used to assign a variable to a value.

    +

    We can chain together multiple logical operators in a single question. Say we wanted to ask, “is number less than 10 but also greater than 4?” We would do so with the following expression: number < 10 & number > 4.

    For quick reference, here are the most commonly used statements and operators.

    @@ -350,28 +353,28 @@

    Logical Operators in R

    - + + + + + - + - + - + - - - - @@ -401,8 +404,11 @@

    Exercise

    -
    -

    select()

    +
    +

    select(), slice(), and filter()

    +

    Now that we know how to ask logical questions in R, we can take advantage of this to subset our data in any way we’d like. In a nutshell, the select() function allows you to select certain columns of your data frame to work with, while the slice() function allows you to select certain rows. You can isolate specific entries with a combination of slice() and select(). filter() allows you to apply a conditional statement to the rows of your table, using the logical operators we talked about in the previous section.

    +
    +

    select()

    You can use the select() function to keep only a subset of variables (columns). Let’s select the variables OTU0001, OTU0002, OTU0004, Depth.

    restricted_columns <- select(OTU_metadata_table, OTU0001, OTU0002, OTU0004, Depth)
    @@ -413,6 +419,7 @@

    select()

    restricted_columns
    +

    Exercise

    As an exercise, select for only the depth and geochemical columns (Depth, NO3, Mean_NO2, Mean_N2O, and Mean_NH4) in OTU_metadata_table and name the new table metadata:

    @@ -425,12 +432,55 @@

    Exercise

    select(OTU_metadata_table, Depth, NO3, Mean_NO2, Mean_N2O, Mean_NH4)
    - - - - - - +
    +
    +

    slice()

    +

    We can also only choose to work with specific rows in our data table using the slice() function.

    +

    To select a subset of observations (rows) by their ordinal position, we use the slice() function.

    +
    +
    slice(OTU_metadata_table, 1)
    + +
    +

    You can list multiple ordinal positions to select multiple observations at once.

    +
    +
    slice(OTU_metadata_table, 1, 2, 3, 4, 5)
    + +
    +

    If you would like to select a range of observations, give the starting and end position separated by a colon like so: <start>:<end>.

    +
    +
    slice(OTU_metadata_table, 1:5)
    + +
    +

    Quiz
    +
    +
    +
    +
    + +
    +

    +
    +
    +

    Exercise: slice() and select()

    +

    Using slice() and select(), determine:

    +
      +
    1. what depth value occurs in the 20th row?
    2. +
    3. what methane value occurs in the 170th row?
    4. +
    +
    +
    dat <- geochemicals
    + +
    +
    +
    # Recall that slice() allows you to find a row
    +# and select() allows you to find a column
    +slice(dat, 20)
    +select(dat, depth)
    +
    +
    +
    slice(select(dat, Depth), 20)
    +select(slice(dat, 170), methane)
    +

    filter()

    @@ -492,54 +542,11 @@

    Exercise

    -
    -

    slice()

    -

    We can also only choose to work with specific rows in our data table using the slice() function.

    -

    To select a subset of observations (rows) by their ordinal position, we use the slice() function.

    -
    -
    slice(OTU_metadata_table, 1)
    - -
    -

    You can list multiple ordinal postions to select multiple observations at once.

    -
    -
    slice(OTU_metadata_table, 1, 2, 3, 4, 5)
    - -
    -

    If you would like to to select a range of observations, give the starting and end position separated by a colon like so: <start>:<end>.

    -
    -
    slice(OTU_metadata_table, 1:5)
    - -
    -

    Quiz
    -
    -
    -
    -
    - -
    -

    -
    -

    Exercise: slice() and select()

    -

    Using slice() and select(), determine:

    -
      -
    1. what depth value occurs in the 20th row?
    2. -
    3. what methane value occurs in the 170th row?
    4. -
    -
    -
    dat <- geochemicals
    - -
    -
    -
    # Recall that slice() allows you to find a row
    -# and select() allows you to find a column
    -slice(dat, 20)
    -select(dat, depth)
    -
    -
    -
    slice(select(dat, Depth), 20)
    -select(slice(dat, 170), methane)
    -
    +
    +

    mutate()

    +
    +

    The Pipe (%>%) Operator

    Summary Exercise

    @@ -643,7 +650,7 @@

    Additional resources

    + + + + + + + + + + - - - - - - - - - -
    ==equalsequivalent to
    !=not equivalent to
    < or > less/greater than
    <= or >= less/greater than or equal to
    %in% in
    is.na is missing (NA)
    !not (as in not equal to !=)
    & and