PecanProject · Sweetdevil144 · Jul 18, 2024 · Jul 18, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,5 @@
 # Change Log
+
 All notable changes are kept in this file. All changes made should be added to the section called
 `Unreleased`. Once a new release is made this file will be updated to create a new `Unreleased`
 section for the next release.
@@ -9,6 +10,8 @@ For more information about this file see also [Keep a Changelog](http://keepacha
 
 ### Added
 
+- Refactored `convert_input` to perform its tasks via helper functions. Subtask of [#3307](https://github.com/PecanProject/pecan/issues/3307)
+
 ### Fixed
 
 ### Changed

diff --git a/base/db/R/add.database.entries.R b/base/db/R/add.database.entries.R
@@ -0,0 +1,111 @@
+#' Add or update database records for converted input files
+#'
+#' For each element of `result`, updates the matching `inputs` / `dbfiles`
+#' records or inserts new ones, with extra handling for ensembles.
+#'
+#' @param result list of results from the download function
+#' @param con database connection
+#' @param start_date start date of the data
+#' @param end_date end date of the data
+#' @param write whether to write to the database
+#' @param overwrite Logical: If a file already exists, create a fresh copy?
+#' @param insert.new.file whether to insert a new file
+#' @param input.args input arguments obtained from the convert_input function
+#' @param machine machine information
+#' @param mimetype data product specific file format
+#' @param formatname format name of the data
+#' @param allow.conflicting.dates whether to allow conflicting dates
+#' @param ensemble ensemble id
+#' @param ensemble_name ensemble name
+#' @param existing.input existing input records
+#' @param existing.dbfile existing dbfile records
+#' @param input input records
+#' @param site.id id of the site used when a new input record is inserted;
+#'   replaced by `input.args$newsite` when that element is set
+#' @return list with elements `input.id` and `dbfile.id`, or NULL when
+#'   `write` is FALSE
+#'
+#' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko
+add.database.entries <- function(
+    result, con, start_date,
+    end_date, write, overwrite,
+    insert.new.file, input.args,
+    machine, mimetype, formatname,
+    allow.conflicting.dates, ensemble,
+    ensemble_name, existing.input,
+    existing.dbfile, input, site.id = NULL) {
+    if (!write) {
+        PEcAn.logger::logger.warn("Input was not added to the database")
+        return(NULL)
+    }
+
+    # A site supplied through input.args takes precedence over site.id.
+    if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) {
+        site.id <- input.args$newsite
+    }
+
+    # This is to tell input.insert if we are writing ensembles.
+    # Why does it need it? Because it checks for inputs with the same time period, site, and machine
+    # and if it returns something it does not insert anymore, but for ensembles, it needs to bypass this condition
+    ens.flag <- !is.null(ensemble) || is.null(ensemble_name)
+
+    # Returned list: a vector of input IDs and a vector of dbfile IDs. The loop is
+    # set up to append one input.id and one dbfile.id per entry of result.
+    newinput <- list(input.id = NULL, dbfile.id = NULL) # Blank vectors are null.
+
+    # seq_along() (not 1:length()) so that an empty result skips the loop.
+    for (i in seq_along(result)) {
+        id_not_added <- TRUE
+        has.input <- !is.null(existing.input) && nrow(existing.input[[i]]) > 0
+
+        if (has.input &&
+            (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) {
+            # Updating record with new dates
+            db.query(paste0("UPDATE inputs SET start_date='", start_date, "', end_date='", end_date, "' WHERE id=", existing.input[[i]]$id), con)
+            id_not_added <- FALSE
+            newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id)
+            newinput$dbfile.id <- c(newinput$dbfile.id, existing.dbfile[[i]]$id)
+        }
+
+        if (overwrite) {
+            # A bit hacky, but need to make sure that all fields are updated to expected values
+            # (i.e., what they'd be if convert_input was creating a new record)
+            if (has.input) {
+                db.query(paste0("UPDATE inputs SET name='", basename(dirname(result[[i]]$file[1])), "' WHERE id=", existing.input[[i]]$id), con)
+            }
+
+            if (!is.null(existing.dbfile) && nrow(existing.dbfile[[i]]) > 0) {
+                db.query(paste0("UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), "', file_name='", result[[i]]$dbfile.name[1], "' WHERE id=", existing.dbfile[[i]]$id), con)
+            }
+        }
+
+        # The ids for this entry were already recorded by the date update above.
+        if (!id_not_added) {
+            next
+        }
+
+        if (insert.new.file) {
+            dbfile.id <- dbfile.insert(in.path = dirname(result[[i]]$file[1]), in.prefix = result[[i]]$dbfile.name[1], "Input", existing.input[[i]]$id, con, reuse = TRUE, hostname = machine$hostname)
+            newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id)
+            newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id)
+        } else {
+            # Fail with a clear message instead of an "object not found" error
+            # (or an accidental pick-up of a global variable named site.id).
+            if (is.null(site.id)) {
+                PEcAn.logger::logger.severe("Cannot insert a new input record: no site.id or input.args$newsite was supplied")
+            }
+
+            # If there is no ensemble then for each record there should be one parent
+            # But when you have ensembles, all of the members have one parent !!
+            parent.idx <- if (is.numeric(ensemble)) 1 else i
+            parent.id <- if (is.null(input[[i]])) NA else input[[parent.idx]]$id
+
+            new_entry <- dbfile.input.insert(
+                in.path = dirname(result[[i]]$file[1]),
+                in.prefix = result[[i]]$dbfile.name[1],
+                siteid = site.id,
+                startdate = start_date,
+                enddate = end_date,
+                mimetype = mimetype,
+                formatname = formatname,
+                parentid = parent.id,
+                con = con,
+                hostname = machine$hostname,
+                allow.conflicting.dates = allow.conflicting.dates,
+                ens = ens.flag
+            )
+
+            newinput$input.id <- c(newinput$input.id, new_entry$input.id)
+            newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id)
+        }
+    } # End for loop
+
+    return(newinput)
+}
diff --git a/base/db/R/check.missing.files.R b/base/db/R/check.missing.files.R
@@ -0,0 +1,49 @@
+#' Function to check if result has empty or missing files
+#'
+#' Logs a severe error listing the affected files when any file in `result`
+#' is missing or has size zero.
+#'
+#' @param result A list of dataframes with file paths
+#' @param outname Name of the output file
+#' @param existing.input Existing input records
+#' @param existing.dbfile Existing dbfile records
+#' @return A named list with elements `result_sizes` (the rows of `result` with
+#'   `file_size`, `missing` and `empty` columns added), `outlist` (`outname`
+#'   split on "_"), `existing.input` and `existing.dbfile` (each wrapped in a
+#'   list when it was passed as a data frame)
+#'
+#' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko
+
+check_missing_files <- function(result, outname, existing.input = NULL, existing.dbfile = NULL) {
+  # file.size() returns NA for a file that does not exist, so `missing` marks
+  # nonexistent files and `empty` marks zero-byte ones.
+  result_sizes <- purrr::map_dfr(
+    result,
+    ~ dplyr::mutate(
+      .,
+      file_size = purrr::map_dbl(file, file.size),
+      missing = is.na(file_size),
+      empty = file_size == 0
+    )
+  )
+
+  if (any(result_sizes$missing) || any(result_sizes$empty)) {
+    # Render a data frame as plain text: header row first, one line per row.
+    log_format_df <- function(df) {
+      formatted_df <- rbind(colnames(df), format(df))
+      formatted_text <- purrr::reduce(formatted_df, paste, sep = " ")
+      paste(formatted_text, collapse = "\n")
+    }
+
+    # NOTE(review): the column positions c(1, 8, 9, 10) assume each element of
+    # `result` has exactly seven columns before the three added above; confirm.
+    PEcAn.logger::logger.severe(
+      "Requested Processing produced empty files or Nonexistent files:\n",
+      log_format_df(result_sizes[, c(1, 8, 9, 10)]),
+      "\n Table of results printed above.",
+      wrap = FALSE
+    )
+  }
+
+  # Insert into Database
+  outlist <- unlist(strsplit(outname, "_"))
+
+  # Wrap in a list for consistent processing later
+  if (is.data.frame(existing.input)) {
+    existing.input <- list(existing.input)
+  }
+
+  if (is.data.frame(existing.dbfile)) {
+    existing.dbfile <- list(existing.dbfile)
+  }
+
+  # The elements are named because callers unpack them with `$`; an unnamed
+  # list would make every `$` lookup return NULL.
+  list(
+    result_sizes = result_sizes,
+    outlist = outlist,
+    existing.input = existing.input,
+    existing.dbfile = existing.dbfile
+  )
+}
diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R
@@ -384,7 +384,7 @@ convert_input <-
     if (!is.null(ensemble) && ensemble) {
       return.all <-TRUE
 
-    }else{
+    } else{
       return.all <- FALSE
     }
       existing.dbfile <- dbfile.input.check(siteid = site.id,
@@ -734,143 +734,23 @@ convert_input <-
   #--------------------------------------------------------------------------------------------------#
   # Check if result has empty or missing files
 
-  result_sizes <- purrr::map_dfr(
-    result,
-    ~ dplyr::mutate(
-      .,
-      file_size = purrr::map_dbl(file, file.size),
-      missing = is.na(file_size),
-      empty = file_size == 0
-    )
-  )
-
-  if (any(result_sizes$missing) || any(result_sizes$empty)){
-    log_format_df = function(df){
-        rbind(colnames(df), format(df))
-        purrr::reduce( paste, sep=" ") %>%
-        paste(collapse="\n")
-    }
-
-    PEcAn.logger::logger.severe(
-      "Requested Processing produced empty files or Nonexistant files :\n",
-      log_format_df(result_sizes[,c(1,8,9,10)]),
-      "\n Table of results printed above.",
-      wrap = FALSE)
-  }
-
-  # Insert into Database
-  outlist <- unlist(strsplit(outname, "_"))
-
-  # Wrap in a list for consistant processing later
-  if (exists("existing.input") && is.data.frame(existing.input)) {
-    existing.input <- list(existing.input)
-  }
-
-  if (exists("existing.dbfile") && is.data.frame(existing.dbfile)) {
-    existing.dbfile <- list(existing.dbfile)
-  }
+  checked.missing.files <- check_missing_files(result, outname, existing.input, existing.dbfile)
+
+  # Unwrap parameters after performing checks for missing files
+  result_sizes <- checked.missing.files$result_sizes;
+  outlist <- checked.missing.files$outlist;
+  existing.input <- checked.missing.files$existing.input;
+  existing.dbfile <- checked.missing.files$existing.dbfile;
 
   #---------------------------------------------------------------#
   # New arrangement of database adding code to deal with ensembles.
-  if (write) {
-
-    # Setup newinput.  This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result.
-    # This list will be returned.
-    newinput = list(input.id = NULL, dbfile.id = NULL) #Blank vectors are null.
-    for(i in 1:length(result)) {  # Master for loop
-      id_not_added <- TRUE
-
-      if (exists("existing.input") && nrow(existing.input[[i]]) > 0 && 
-          (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) {
-
-        # Updating record with new dates
-        db.query(paste0("UPDATE inputs SET start_date='", start_date, "', end_date='",
-                                  end_date, "'  WHERE id=", existing.input[[i]]$id), 
-                           con)
-        id_not_added = FALSE
-
-        # The overall structure of this loop has been set up so that exactly one input.id and one dbfile.id will be written to newinput every interation.
-        newinput$input.id = c(newinput$input.id, existing.input[[i]]$id)
-        newinput$dbfile.id = c(newinput$dbfile.id, existing.dbfile[[i]]$id)
-      }
-
-      if (overwrite) {
-        # A bit hacky, but need to make sure that all fields are updated to expected
-        # values (i.e., what they'd be if convert_input was creating a new record)
-        if (exists("existing.input") && nrow(existing.input[[i]]) > 0) {
-            db.query(paste0("UPDATE inputs SET name='", basename(dirname(result[[i]]$file[1])),
-                                      "' WHERE id=", existing.input[[i]]$id), con)
-
-        }
-
-        if (exists("existing.dbfile") && nrow(existing.dbfile[[i]]) > 0) {
-            db.query(paste0("UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]),
-                                      "', ", "file_name='", result[[i]]$dbfile.name[1], 
-                                      "' WHERE id=", existing.dbfile[[i]]$id), con)
-
-        }
-      }
-
-      # If there is no ensemble then for each record there should be one parent
-      #But when you have ensembles, all of the members have one parent !!
-      if (is.numeric(ensemble)){
-        parent.id <- ifelse(is.null(input[i]), NA, input[1]$id)
-      }else{
-        parent.id <- ifelse(is.null(input[i]), NA, input[i]$id)  
-      }
-
-
-
-      if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) {
-        site.id <- input.args$newsite
-      }
-
-      if (insert.new.file && id_not_added) {
-        dbfile.id <- dbfile.insert(in.path = dirname(result[[i]]$file[1]),
-                                             in.prefix = result[[i]]$dbfile.name[1], 
-                                             'Input', existing.input[[i]]$id, 
-                                             con, reuse=TRUE, hostname = machine$hostname)
-        newinput$input.id  <- c(newinput$input.id, existing.input[[i]]$id)
-        newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id)
-      } else if (id_not_added) {
-
-      # This is to tell input.insert if we are wrting ensembles
-      # Why does it need it ? bc it checks for inputs with the same time period, site and machine
-      # and if it returns somethings it does not insert anymore, but for ensembles it needs to bypass this condition
-      if (!is.null(ensemble) | is.null(ensemble_name)){
-        ens.flag <- TRUE
-      }else{
-        ens.flag <- FALSE
-      }
-
-        new_entry <- dbfile.input.insert(in.path = dirname(result[[i]]$file[1]),
-                                                   in.prefix = result[[i]]$dbfile.name[1], 
-                                                   siteid = site.id, 
-                                                   startdate = start_date,
-                                                   enddate = end_date, 
-                                                   mimetype, 
-                                                   formatname, 
-                                                   parentid = parent.id,
-                                                   con = con, 
-                                                   hostname = machine$hostname,
-                                                   allow.conflicting.dates = allow.conflicting.dates, 
-                                                   ens=ens.flag
-                                                   )
-
-
-        newinput$input.id <- c(newinput$input.id, new_entry$input.id)
-        newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id)
-      }
-
-    } #End for loop
-
-    successful <- TRUE
-    return(newinput)
-  } else {
-    PEcAn.logger::logger.warn("Input was not added to the database")
-    successful <- TRUE
-    return(NULL)
-  }
+  return (add.database.entries(result, con, start_date,
+    end_date, write, overwrite,
+    insert.new.file, input.args,
+    machine, mimetype, formatname,
+    allow.conflicting.dates, ensemble,
+    ensemble_name, existing.input,
+    existing.dbfile, input))
 } # convert_input