From 31deb05ea52d0bf8c31e167f0956d21d9911932d Mon Sep 17 00:00:00 2001 From: Nathan Hwangbo Date: Mon, 30 Sep 2019 17:01:23 -0700 Subject: [PATCH 1/4] change read_csv path to specify package --- R/bib_reader.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bib_reader.R b/R/bib_reader.R index 0025dce..5f808ab 100644 --- a/R/bib_reader.R +++ b/R/bib_reader.R @@ -29,7 +29,7 @@ bibfile_reader <- function(isi_dir, formatting){ purrr::map_dfr(as.data.frame) # read the lookup table to match fields - lut <- readr::read_csv("inst/LUT_non_isi.csv") + lut <- readr::read_csv(system.file("LUT_non_isi.csv", package = "BibScan")) # Check if the necessary fields are here stopifnot(sum(lut[[formatting]] %in% names(biblio_data)) == 5) From 57920076c8d3edec9ad815107d7a89c817b095fa Mon Sep 17 00:00:00 2001 From: Nathan Hwangbo Date: Wed, 23 Oct 2019 15:45:41 -0700 Subject: [PATCH 2/4] Add file names to pdfs --- R/article_pdf_download.R | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/R/article_pdf_download.R b/R/article_pdf_download.R index 054c7d8..8cd0894 100644 --- a/R/article_pdf_download.R +++ b/R/article_pdf_download.R @@ -54,7 +54,8 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma # Select attributes of interest and clean the author field my_df <- tibble::tibble(name = paste(gsub(";.*$", "", matched$AU),matched$PY,matched$SO), titles = matched$TI, - DOI = matched$DI) + DOI = matched$DI, + py = matched$PY) # Print percent of papers with DOI perc <- suppressWarnings((nrow(dplyr::filter(my_df, !is.na(DOI)))/nrow(my_df))) @@ -88,7 +89,7 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma rm(perc) # Add to report document which reference didn't have URL - report <- dplyr::left_join(report, my_df, by = c("name", "DOI", "titles")) + report <- dplyr::left_join(report, my_df, by = c("name", "DOI", "titles", "py")) report$length <- lapply(report$links, length) report$URL_found <- ifelse((report$length > 0), TRUE, FALSE) report <- dplyr::select(report, name, titles, DOI, DOI_exists, URL_found) @@ -98,6 +99,21 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma # Remove references with no URL my_df <- my_df[lapply(my_df$links, length) > 0, ] + # try to add the first author and doi to put as file names. + my_df <- my_df %>% + mutate(first_author = name %>% + str_replace_all("\\.\\s", "\\.") %>% + str_replace_all("[A-Z]\\s[A-Z]", "") %>% + str_replace_all('"', "") %>% + str_replace_all("and", "") %>% + word(1, 2) %>% + str_trim() %>% + str_replace("\\s", ""), + # replace backslashes with 2 periods in a row + doi_title = DOI %>% str_replace_all("/", "\\.\\.") + + ) + # Elsevier links require a separate download process, so we distinguish them here # my_df <- elsevier_tagger(my_df, "links") not neede anymore @@ -134,11 +150,16 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma finally = message(sprintf("\nThe reference %s has been processed \n", my_df$name[[i]])) ) # keep track of the PDF names - if (length(crminer::crm_cache$list()) > nb_pdfs) { + if (length(crminer::crm_cache$list()) >= nb_pdfs) { # print(crminer::crm_cache$list()) nb_pdfs <- length(crminer::crm_cache$list()) last_paper <- setdiff(crminer::crm_cache$list(), old_cache) - my_df$downloaded_file[i] <- last_paper + + # rename file + file_name <- file.path(dirname(last_paper), paste0(test[i,]$first_author, test[i,]$py, "_", test[i,]$doi_title, ".pdf")) + file.rename(from = last_paper, to = file_name) + + my_df$downloaded_file[i] <- file_name } else { my_df$downloaded_file[i] <- NA } From 2a8cb83342b7b0fdfaf32c0d93aa0583ccb3e102 Mon Sep 17 00:00:00 2001 From: Nathan Hwangbo Date: Wed, 23 Oct 2019 16:02:37 -0700 Subject: [PATCH 3/4] Add overwrite parameter to prevent accidental overwrite of outfilepath --- R/article_pdf_download.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/R/article_pdf_download.R b/R/article_pdf_download.R index 8cd0894..7b24316 100644 --- a/R/article_pdf_download.R +++ b/R/article_pdf_download.R @@ -6,6 +6,7 @@ #' @param bib_format (character) format used by the bibtex file #' @param colandr A file (character) that provides titles to match; designed to be output of Colandr #' @param cond Condition (logical) that defines sorting of output from Colandr file +#' @param overwrite Condition (logical) that determines whether to overwrite contents of outfilepath, if it's nonempty #' #' @return data frame containing dowload information #' @@ -15,12 +16,16 @@ #' @export #' @examples \dontrun{ article_pdf_download(infilepath = "/data/isi_searches", outfilepath = "data")} -article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_format = "soc_bib", colandr=NULL, cond="included"){ +article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_format = "soc_bib", colandr=NULL, cond="included", overwrite = FALSE){ # =============================== # CONSTANTS # =============================== - # Create the main output directory - dir.create(outfilepath, showWarnings = FALSE) + # give error if overwrite is false AND the directory is nonempty + if(!overwrite & length(list.files(outfilepath, all.files = TRUE)) != 0){ + stop("outfilepath not empty! Set overwrite = TRUE or change outfilepath") + } else{ + dir.create(outfilepath, showWarnings = FALSE) + } # PDF subdirectory pdf_output_dir <- file.path(outfilepath, 'pdfs') From 04c74c6c24db463c9013aae7cc95c198374c6f13 Mon Sep 17 00:00:00 2001 From: Nathan Hwangbo Date: Fri, 25 Oct 2019 10:08:33 -0700 Subject: [PATCH 4/4] Add is_empty column to summary report Fix non_pdf_paths location --- R/article_pdf_download.R | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/R/article_pdf_download.R b/R/article_pdf_download.R index 7b24316..c425964 100644 --- a/R/article_pdf_download.R +++ b/R/article_pdf_download.R @@ -6,7 +6,7 @@ #' @param bib_format (character) format used by the bibtex file #' @param colandr A file (character) that provides titles to match; designed to be output of Colandr #' @param cond Condition (logical) that defines sorting of output from Colandr file -#' @param overwrite Condition (logical) that determines whether to overwrite contents of outfilepath, if it's nonempty +#' @param overwrite Condition (logical) that determines whether to overwrite contents of outfilepath, if it's nonempty (default FALSE) #' #' @return data frame containing dowload information #' @@ -161,7 +161,7 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma last_paper <- setdiff(crminer::crm_cache$list(), old_cache) # rename file - file_name <- file.path(dirname(last_paper), paste0(test[i,]$first_author, test[i,]$py, "_", test[i,]$doi_title, ".pdf")) + file_name <- file.path(dirname(last_paper), paste0(my_df[i,]$first_author, my_df[i,]$py, "_", my_df[i,]$doi_title, ".pdf")) file.rename(from = last_paper, to = file_name) my_df$downloaded_file[i] <- file_name @@ -190,6 +190,7 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma report$downloaded <- as.logical(NA) report$is_pdf <- as.logical(NA) + report$is_empty <- as.logical(NA) for (i in 1:nrow(report)) { print(report$downloaded_file[i]) @@ -198,36 +199,42 @@ article_pdf_download <- function(infilepath, outfilepath = infilepath, bib_forma report$downloaded[i] <- TRUE # test if the file is binary (assumed to be PDF) or not (html or other) report$is_pdf[i] <- suppressWarnings(is_binary(report$downloaded_file[i])) + # test if we just downloaded an empty file. a little more informative than is_pdf + report$is_empty[i] <- file.size(report$downloaded_file[i]) == 0 } else { report$downloaded[i] <- FALSE report$is_pdf[i] <- FALSE + report$is_empty[i] <- FALSE } } # Extract some statistics download_success <- sum(report$downloaded, na.rm = TRUE) # out of 5759 acquired links, 4604 produced downloaded files - unique_files <- length(unique(report$downloaded_file[report$downloaded])) # out of 4604 downloaded files, 4539 are unique - unique_pdfs <- length(unique(report$downloaded_file[report$downloaded & report$is_pdf])) # out of 4539 unique downloaded files, 4057 are binary files (PDFs) + unique_files <- length(unique(report$downloaded_file[report$downloaded & !report$is_empty])) # out of 4604 downloaded files, 4539 are unique and nonempty + unique_pdfs <- length(unique(report$downloaded_file[report$downloaded & report$is_pdf & !report$is_empty])) # out of 4539 unique downloaded files, 4057 are nonempty binary files (PDFs) message(sprintf("Over the %i acquired links, %i PDFs were succesfully downloaded", nrow(report), unique_pdfs)) # Extract the files info that were not PDFs non_pdf_paths <- unique(report$downloaded_file[report$downloaded & !report$is_pdf]) # For investigative purposes, here are the paths for the non-PDF files (482) that were downloaded - if(length(non_pdf_paths > 0)){ + + + + # Make non-pdf links HTML to make them clickable + if(length(non_pdf_paths) > 0){ ## Move the non-pdf files to a specific directory # Create the destination list html_paths <- file.path( nopdf_output_dir, paste0(basename(tools::file_path_sans_ext(non_pdf_paths)), - ".html") + ".html" + ) ) # Move the files file.rename(from = non_pdf_paths, to = html_paths) } - # TO DOs: Add file renaming - # output information regarding the download processs to csv summary_path <- file.path(outfilepath, 'summary.csv') readr::write_csv(report, path = summary_path)