From c7b5953b0f2a7d59c93e0ca88c90d8ecbf9f0e3c Mon Sep 17 00:00:00 2001 From: gbganalyst Date: Sun, 25 Feb 2024 22:48:38 +0100 Subject: [PATCH] updated the vignettes files --- README.Rmd | 8 +- README.md | 7 +- _pkgdown.yml | 2 + vignettes/bulkreadr.Rmd | 251 +--------------------------------- vignettes/labelled-data.Rmd | 128 +++++++++++++++++ vignettes/other-functions.Rmd | 164 ++++++++++++++++++++++ 6 files changed, 302 insertions(+), 258 deletions(-) create mode 100644 vignettes/labelled-data.Rmd create mode 100644 vignettes/other-functions.Rmd diff --git a/README.Rmd b/README.Rmd index 0110e21..b061c0d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -41,12 +41,8 @@ Additionally, the package seamlessly works with labelled data from SPSS and Stat ## The Concept Map - - ![](man/figures/concept-map.png) - - ## Installation You can install `bulkreadr` package from [CRAN](https://cran.r-project.org/) with: @@ -72,13 +68,11 @@ Now that you have installed `bulkreadr` package, you can simply load it by using ```{r pkgload} library(bulkreadr) -library(dplyr) ``` - ## Context -bulkreadr draws on and complements / emulates other packages such as readxl, readr, and googlesheets4 to read bulk data in R. +bulkreadr is designed to integrate with and augment the capabilities of established packages such as `readxl`, `readr`, and `googlesheets4`, offering enhanced functionality for reading bulk data within the R programming environment. * [readxl](https://readxl.tidyverse.org) is the tidyverse package for reading Excel files (xls or xlsx) into an R data frame. diff --git a/README.md b/README.md index 833e73d..9ce4910 100644 --- a/README.md +++ b/README.md @@ -69,13 +69,14 @@ by using: ``` r library(bulkreadr) -library(dplyr) ``` ## Context -bulkreadr draws on and complements / emulates other packages such as -readxl, readr, and googlesheets4 to read bulk data in R. 
+bulkreadr is designed to integrate with and augment the capabilities of +established packages such as `readxl`, `readr`, and `googlesheets4`, +offering enhanced functionality for reading bulk data within the R +programming environment. - [readxl](https://readxl.tidyverse.org) is the tidyverse package for reading Excel files (xls or xlsx) into an R data frame. diff --git a/_pkgdown.yml b/_pkgdown.yml index 99d7f22..3a3bfab 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -54,6 +54,8 @@ articles: navbar: ~ contents: - bulkreadr + - labelled-data + - other-functions navbar: title: "bulkreadr" diff --git a/vignettes/bulkreadr.Rmd b/vignettes/bulkreadr.Rmd index 392f106..4c227a9 100644 --- a/vignettes/bulkreadr.Rmd +++ b/vignettes/bulkreadr.Rmd @@ -64,43 +64,15 @@ Now that you have installed `bulkreadr` package, you can simply load it by using ```{r pkgload} library(bulkreadr) -library(dplyr) ``` ## Functions in bulkreadr package -This section provides a concise overview of the different functions available in the `bulkreadr` package. These functions serve various purposes and are designed to handle importing of data in bulk. +This section provides a concise overview of the different functions available in the `bulkreadr` package for importing bulk data in R. 
-- [`read_excel_workbook()`](#read_excel_workbook) - -- [`read_excel_files_from_dir()`](#read_csv_files_from_dir) - -- [`read_csv_files_from_dir()`](#read_csv_files_from_dir) - -- [`read_gsheets()`](#read_gsheets) - -- [`read_spss_data()`](#read_spss_data) - -- [`read_stata_data()`](#read_stata_data) - -## Other functions in bulkreadr package: - -- [`generate_dictionary()`](#generate_dictionary) - -- [`look_for()`](#look_for) - -- [`pull_out()`](#pull_out) - -- [`convert_to_date()`](#convert_to_date) - -- [`inspect_na()`](#inspect_na) - -- [`fill_missing_values()`](#fill_missing_values) - -**Note:** - -For the majority of functions within this package, we will utilize data stored in the system file by the `bulkreadr`, which can be accessed using the `system.file()` function. If you wish to utilize your own data stored in your local directory, please ensure that you have set the appropriate file path prior to using any functions provided by the bulkreadr package. +## Note +> For the majority of functions within this package, we will utilize data stored in the system file by the `bulkreadr`, which can be accessed using the `system.file()` function. If you wish to utilize your own data stored in your local directory, please ensure that you have set the appropriate file path prior to using any functions provided by the bulkreadr package. ## read_excel_workbook() @@ -167,220 +139,3 @@ sheet_id <- "1izO0mHu3L9AMySQUXGDn9GPs1n-VwGFSEoAKGhqVQh0" read_gsheets(ss = sheet_id) ``` - -## read_spss_data() - -`read_spss_data()` is designed to seamlessly import data from an SPSS data (`.sav` or `.zsav`) files. It converts labelled variables into factors, a crucial step that enhances the ease of data manipulation and analysis within the R programming environment. 
- -**Read the SPSS data file without converting variable labels as column names** - -```{r spssdata1} - -file_path <- system.file("extdata", "Wages.sav", package = "bulkreadr") - -data <- read_spss_data(file = file_path) - -data - -``` - - -**Read the SPSS data file and convert variable labels as column names** - -```{r spssdata2} - -data <- read_spss_data(file = file_path, label = TRUE) - -data - -``` - - -## read_stata_data() - -`read_stata_data()` reads Stata data file (`.dta`) into an R data frame, converting labeled variables into factors. - -**Read the Stata data file without converting variable labels as column names** - -```{r statadata1} - -file_path <- system.file("extdata", "Wages.dta", package = "bulkreadr") - -data <- read_stata_data(file = file_path) - -data - -``` - -**Read the Stata data file and convert variable labels as column names** - -```{r statadata2} - -data <- read_stata_data(file = file_path, label = TRUE) - -data - -``` - - -## generate_dictionary() - -`generate_dictionary()` creates a data dictionary from a specified data frame. This function is particularly useful for understanding and documenting the structure of your dataset, similar to data dictionaries in Stata or SPSS. - -```{r} - -# Creating a data dictionary from an SPSS file - -file_path <- system.file("extdata", "Wages.sav", package = "bulkreadr") - -wage_data <- read_spss_data(file = file_path) - -generate_dictionary(wage_data) -``` - - -## look_for() - -The `look_for()` function is designed to emulate the functionality of the Stata `lookfor` command in R. It provides a powerful tool for searching through large datasets, specifically targeting variable names, variable label descriptions, factor levels, and value labels. This function is handy for users working with extensive and complex datasets, enabling them to quickly and efficiently locate the variables of interest. - - -```{r} - -# Look for a single keyword. 
- -look_for(wage_data, "south") - -look_for(wage_data, "s") -``` - -## pull_out() - -`pull_out()` is similar to [. It acts on vectors, matrices, arrays and lists to extract or replace parts. It is pleasant to use with the magrittr (`⁠%>%`⁠) and base(`|>`) operators. - -```{r example4} - -top_10_richest_nig <- c("Aliko Dangote", "Mike Adenuga", "Femi Otedola", "Arthur Eze", "Abdulsamad Rabiu", "Cletus Ibeto", "Orji Uzor Kalu", "ABC Orjiakor", "Jimoh Ibrahim", "Tony Elumelu") - -top_10_richest_nig %>% - pull_out(c(1, 5, 2)) -``` - -```{r} -top_10_richest_nig %>% - pull_out(-c(1, 5, 2)) -``` - - -## convert_to_date() - -`convert_to_date()` parses an input vector into POSIXct date-time object. It is also powerful to convert from excel date number like `42370` into date value like `2016-01-01`. - -```{r example 5} - -## ** heterogeneous dates ** - -dates <- c( - 44869, "22.09.2022", NA, "02/27/92", "01-19-2022", - "13-01- 2022", "2023", "2023-2", 41750.2, 41751.99, - "11 07 2023", "2023-4" - ) - -# Convert to POSIXct or Date object - -convert_to_date(dates) - -# It can also convert date time object to date object - -convert_to_date(lubridate::now()) - -``` - -## inspect_na() - -`inspect_na()` summarizes the rate of missingness in each column of a data frame. For a grouped data frame, the rate of missingness is summarized separately for each group. - -```{r example 6a} - -# dataframe summary - -inspect_na(airquality) -``` - -**Grouped dataframe summary** - -```{r} -airquality %>% - group_by(Month) %>% - inspect_na() -``` - -## fill_missing_values() - -`fill_missing_values()` in an efficient function that addresses missing values in a dataframe. It uses imputation by function, meaning it replaces missing data in numeric variables with either the mean or the median, and in non-numeric variables with the mode. 
The function takes a column-based imputation approach, ensuring that replacement values are derived from the respective columns, resulting in accurate and consistent data. This method enhances the integrity of the dataset and promotes sound decision-making and analysis in data processing workflows. - -```{r example 6} - -df <- tibble::tibble( - Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5), - Sepal.Width = c(4.1, 3.6, 3, 3, 2.9, 2.5, 2.4), - Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7), - Petal_Width = c(NA, 0.2, 1.2, 0.2, 1.3, 1.8, NA), - Species = c("setosa", NA, "versicolor", "setosa", - NA, "virginica", "setosa" - ) -) - -``` - -```{r} -df -``` - - -**Using mean to fill missing values for numeric variables** - - -```{r} - -result_df_mean <- fill_missing_values(df, use_mean = TRUE) - -result_df_mean - -``` - - -**Using median to fill missing values for numeric variables** - -```{r} -result_df_median <- fill_missing_values(df, use_mean = FALSE) - -result_df_median -``` - - -### Impute missing values (NAs) in a grouped data frame - -You can use the `fill_missing_values()` in a grouped data frame by using other grouping and map functions. 
Here is an example of how to do this: - -```{r} -sample_iris <- tibble::tibble( -Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5), -Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7), -Petal_Width = c(0.3, 0.2, 1.2, 0.2, 1.3, 1.8, NA), -Species = c("setosa", "setosa", "versicolor", "setosa", - "virginica", "virginica", "setosa") -) - -``` - -```{r} -sample_iris -``` - -```{r} -sample_iris %>% - group_by(Species) %>% - group_split() %>% - map_df(fill_missing_values) -``` - diff --git a/vignettes/labelled-data.Rmd b/vignettes/labelled-data.Rmd new file mode 100644 index 0000000..d73ab82 --- /dev/null +++ b/vignettes/labelled-data.Rmd @@ -0,0 +1,128 @@ +--- +title: "Introduction to labelled data" +output: rmarkdown::html_vignette +author: "Ezekiel Ogundepo and Ernest Fokoué" +vignette: > + %\VignetteIndexEntry{Introduction to labelled data} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +description: > + The R ecosystem, through packages like `foreign` and `haven`, facilitates the importation of labelled data from software like SPSS and Stata, ensuring a smooth transition into R. This vignette introduces you to other functions in bulkreadr, such as `read_spss_data()`, which extends this functionality by leveraging `haven` to streamline the process further. +editor_options: + chunk_output_type: console +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + message = FALSE, + warning = FALSE, + comment = "#>", + fig.path = "man/figures/", + out.width = "100%") + +options(tibble.print_min = 5, tibble.print_max = 5) + +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +# What is labelled data in R? + +Labelled data in SPSS and Stata refers to datasets where each variable (or column) and its values are assigned meaningful labels. These labels provide context, such as descriptions or categories, making the data easier to understand and analyze. 
For instance, a variable representing gender might have numerical codes (1, 2) with labels ("Male", "Female"). This feature enhances data analysis by allowing researchers to work with descriptive labels instead of deciphering codes or numeric values, facilitating clearer interpretation and communication of statistical results. + +The R ecosystem, through packages like `foreign` and `haven`, facilitates the importation of labelled data from software like SPSS and Stata, ensuring a smooth transition into R. The `bulkreadr` package extends this functionality by leveraging `haven` to further streamline the process. It automatically converts labelled data into R's factor data type, eliminating the need for manual recoding. This enhancement significantly improves the efficiency of the data analysis workflow within the R environment. + + +## Note + +> For the majority of functions within this package, we will utilize data stored in the system file by the `bulkreadr`, which can be accessed using the `system.file()` function. If you wish to utilize your own data stored in your local directory, please ensure that you have set the appropriate file path prior to using any functions provided by the bulkreadr package. + +## read_spss_data() + +`read_spss_data()` is designed to seamlessly import data from SPSS data files (`.sav` or `.zsav`). It converts labelled variables into factors, a crucial step that enhances the ease of data manipulation and analysis within the R programming environment. 
+ +**Read the SPSS data file without converting variable labels as column names** + +```{r spssdata1} + +library(bulkreadr) + +file_path <- system.file("extdata", "Wages.sav", package = "bulkreadr") + +data <- read_spss_data(file = file_path) + +data + +``` + + +**Read the SPSS data file and convert variable labels as column names** + +```{r spssdata2} + +data <- read_spss_data(file = file_path, label = TRUE) + +data + +``` + + +## read_stata_data() + +`read_stata_data()` reads Stata data file (`.dta`) into an R data frame, converting labeled variables into factors. + +**Read the Stata data file without converting variable labels as column names** + +```{r statadata1} + +file_path <- system.file("extdata", "Wages.dta", package = "bulkreadr") + +data <- read_stata_data(file = file_path) + +data + +``` + +**Read the Stata data file and convert variable labels as column names** + +```{r statadata2} + +data <- read_stata_data(file = file_path, label = TRUE) + +data + +``` + + +## generate_dictionary() + +`generate_dictionary()` creates a data dictionary from a specified data frame. This function is particularly useful for understanding and documenting the structure of your dataset, similar to data dictionaries in Stata or SPSS. + +```{r} + +# Creating a data dictionary from an SPSS file + +file_path <- system.file("extdata", "Wages.sav", package = "bulkreadr") + +wage_data <- read_spss_data(file = file_path) + +generate_dictionary(wage_data) +``` + + +## look_for() + +The `look_for()` function is designed to emulate the functionality of the Stata `lookfor` command in R. It provides a powerful tool for searching through large datasets, specifically targeting variable names, variable label descriptions, factor levels, and value labels. This function is handy for users working with extensive and complex datasets, enabling them to quickly and efficiently locate the variables of interest. + + +```{r} + +# Look for a single keyword. 
+ +look_for(wage_data, "south") + +look_for(wage_data, "^s") +``` + + + diff --git a/vignettes/other-functions.Rmd b/vignettes/other-functions.Rmd new file mode 100644 index 0000000..689ba8a --- /dev/null +++ b/vignettes/other-functions.Rmd @@ -0,0 +1,164 @@ +--- +title: "Other functions in bulkreadr" +output: rmarkdown::html_vignette +author: "Ezekiel Ogundepo and Ernest Fokoué" +vignette: > + %\VignetteIndexEntry{Other functions in bulkreadr} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +description: > + The `bulkreadr` package includes specialized functions beyond bulk data reading, aimed at enhancing data analysis efficiency. These functions are designed to operate on individual vectors, except for `inspect_na()` and `fill_missing_values()`, which work on data frames. +editor_options: + chunk_output_type: console +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + message = FALSE, + warning = FALSE, + comment = "#>", + fig.path = "man/figures/", + out.width = "100%") + +options(tibble.print_min = 5, tibble.print_max = 5) + +options(rmarkdown.html_vignette.check_title = FALSE) +``` + +The `bulkreadr` package in R includes specialized functions beyond bulk data reading, aimed at enhancing data analysis efficiency. These functions are designed to operate on individual vectors, except for `inspect_na()` and `fill_missing_values()`, which work on data frames. + +## pull_out() + +`pull_out()` is similar to `[`. It acts on vectors, matrices, arrays and lists to extract or replace parts. It is pleasant to use with the magrittr (`%>%`) and base (`|>`) pipe operators. 
+ +```{r example4} + +library(bulkreadr) +library(dplyr) + +top_10_richest_nig <- c("Aliko Dangote", "Mike Adenuga", "Femi Otedola", "Arthur Eze", "Abdulsamad Rabiu", "Cletus Ibeto", "Orji Uzor Kalu", "ABC Orjiakor", "Jimoh Ibrahim", "Tony Elumelu") + +top_10_richest_nig %>% + pull_out(c(1, 5, 2)) +``` + +```{r} +top_10_richest_nig %>% + pull_out(-c(1, 5, 2)) +``` + + +## convert_to_date() + +`convert_to_date()` parses an input vector into POSIXct date-time object. It is also powerful to convert from excel date number like `42370` into date value like `2016-01-01`. + +```{r example 5} + +## ** heterogeneous dates ** + +dates <- c( + 44869, "22.09.2022", NA, "02/27/92", "01-19-2022", + "13-01- 2022", "2023", "2023-2", 41750.2, 41751.99, + "11 07 2023", "2023-4" + ) + +# Convert to POSIXct or Date object + +convert_to_date(dates) + +# It can also convert date time object to date object + +convert_to_date(lubridate::now()) + +``` + +## inspect_na() + +`inspect_na()` summarizes the rate of missingness in each column of a data frame. For a grouped data frame, the rate of missingness is summarized separately for each group. + +```{r example 6a} + +# dataframe summary + +inspect_na(airquality) +``` + +**Grouped dataframe summary** + +```{r} +airquality %>% + group_by(Month) %>% + inspect_na() +``` + +## fill_missing_values() + +`fill_missing_values()` is an efficient function that addresses missing values in a dataframe. It uses imputation by function, meaning it replaces missing data in numeric variables with either the mean or the median, and in non-numeric variables with the mode. The function takes a column-based imputation approach, ensuring that replacement values are derived from the respective columns, resulting in accurate and consistent data. This method enhances the integrity of the dataset and promotes sound decision-making and analysis in data processing workflows. 
+ +```{r example 6} + +df <- tibble::tibble( + Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5), + Sepal.Width = c(4.1, 3.6, 3, 3, 2.9, 2.5, 2.4), + Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7), + Petal_Width = c(NA, 0.2, 1.2, 0.2, 1.3, 1.8, NA), + Species = c("setosa", NA, "versicolor", "setosa", + NA, "virginica", "setosa" + ) +) + +``` + +```{r} +df +``` + + +**Using mean to fill missing values for numeric variables** + + +```{r} + +result_df_mean <- fill_missing_values(df, use_mean = TRUE) + +result_df_mean + +``` + + +**Using median to fill missing values for numeric variables** + +```{r} +result_df_median <- fill_missing_values(df, use_mean = FALSE) + +result_df_median +``` + + +### Impute missing values (NAs) in a grouped data frame + +You can use the `fill_missing_values()` in a grouped data frame by using other grouping and map functions. Here is an example of how to do this: + +```{r} +sample_iris <- tibble::tibble( +Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5), +Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7), +Petal_Width = c(0.3, 0.2, 1.2, 0.2, 1.3, 1.8, NA), +Species = c("setosa", "setosa", "versicolor", "setosa", + "virginica", "virginica", "setosa") +) + +``` + +```{r} +sample_iris +``` + +```{r} +sample_iris %>% + group_by(Species) %>% + group_split() %>% + map_df(fill_missing_values) +``` +