Data validation functions in process_input() now print more helpful error messages

almeidasilvaf · almeidasilvaf · commit 8e2ed2c43d11 · 2024-03-20T20:27:46.000+01:00
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -52,7 +52,7 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: '4.2', bioc: '3.15', cont: "bioconductor/bioconductor_docker:RELEASE_3_15", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -119,16 +119,16 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v2
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -306,7 +306,7 @@ jobs:
         if: failure()
         uses: actions/upload-artifact@v2
         with:
-          name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results
+          name: ${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-results
           path: check
 
       - uses: docker/build-push-action@v1
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -48,7 +48,7 @@ biocViews: Software,
     Network
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Imports:
     Rcpp (>= 1.0.8),
     BiocParallel,
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,7 +6,6 @@ export(cluster_network)
 export(collapse_protein_ids)
 export(create_species_id_table)
 export(diamond_is_installed)
-export(last_is_installed)
 export(export_sequences)
 export(fasta2AAStringSetlist)
 export(find_GS_clusters)
@@ -17,6 +16,7 @@ export(interspecies_synteny)
 export(intraspecies_synteny)
 export(iqtree_is_installed)
 export(iqtree_version)
+export(last_is_installed)
 export(parse_collinearity)
 export(phylogenomic_profile)
 export(plot_network)
@@ -61,7 +61,9 @@ importFrom(stats,ave)
 importFrom(stats,dist)
 importFrom(stats,hclust)
 importFrom(stats,reshape)
+importFrom(utils,capture.output)
 importFrom(utils,combn)
+importFrom(utils,head)
 importFrom(utils,read.csv)
 importFrom(utils,read.table)
 importFrom(utils,write.table)
diff --git a/R/07_microsynteny-based_phylogeny.R b/R/07_microsynteny-based_phylogeny.R
@@ -133,13 +133,7 @@ infer_microsynteny_phylogeny <- function(transposed_profiles = NULL,
     # Run IQ-TREE
     if(iqtree_version() == 1) { # IQ-TREE v1
         iqtree_args <- c(
-            "-s ", matrix_file, 
-            "-bb", bootr, 
-            "-alrt", alrtboot, 
-            "-nt", threads, 
-            root,
-            "-m", model,
-            "-st MORPH -redo"
+            "-s ", matrix_file, "-bb", bootr, "-alrt", alrtboot, "-nt", threads, root, "-m", model, "-st MORPH -redo"
         )
         iqtree <- system2("iqtree", args = iqtree_args, stdout = stdout)
     } else { # IQ-TREE v2
diff --git a/R/utils.R b/R/utils.R
@@ -1,7 +1,6 @@
 
 #' Check if the names of the lists of sequences and annotations match.
 #' 
-#'
 #' @param seq A list of AAStringSet objects.
 #' @param annotation A GRangesList, CompressedGRangesList, or list of
 #' GRanges with the annotation for the sequences in \strong{seq}.
@@ -17,18 +16,37 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
     
     annot_names <- names(annotation)
     seq_names <- names(seq)
-    n_match <- annot_names %in% seq_names
     
+    # Check for differences in both sets
+    diff_seq <- setdiff(seq_names, annot_names)
+    diff_annot <- setdiff(annot_names, seq_names)
+    
+    check <- TRUE
     if(is.null(annot_names) | is.null(seq_names)) {
         stop("List-like arguments 'seq' and 'annotation' must have names.")
-    } else if(any(n_match == FALSE)) {
-        stop("Names of list elements in 'seq' and 'annotation' must match.")
-    } else {
-        check <- TRUE
+    } else if(length(diff_seq) != 0 & length(diff_annot) == 0) {
+        stop(
+            "The following elements in `seq` were not found in `annotation`:\n",
+            paste0(diff_seq, collapse = "\n")
+        )
+    } else if(length(diff_seq) == 0 & length(diff_annot) != 0) {
+        stop(
+            "The following elements in `annotation` were not found in `seq:`\n",
+            paste0(diff_annot, collapse = "\n")
+        )
+    } else if(length(diff_seq) != 0 & length(diff_annot) != 0) {
+        stop(
+            "Element in `seq` but not in `annotation`: \n",
+            paste0(diff_seq, collapse = "\n"),
+            "\n\nElements in `annotation` but not in `seq`: \n",
+            paste0(diff_annot, collapse = "\n")
+        )
     }
+    
     return(check)
 }
 
+
 #' Check that the number of sequences does not exceed the number of genes
 #'
 #' @param seq A list of AAStringSet objects.
@@ -37,6 +55,7 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
 #'
 #' @return TRUE if the objects pass the check.
 #' @noRd 
+#' @importFrom utils capture.output
 #' @examples 
 #' data(proteomes)
 #' data(annotation)
@@ -47,34 +66,31 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
     # Data frame of species and gene count based on annotation
     gene_count <- Reduce(rbind, lapply(seq_along(annotation), function(x) {
         count <- length(annotation[[x]][annotation[[x]]$type == "gene"])
-        count_df <- data.frame(
-            Species = names(annotation)[x],
-            Genes = count
-        )
+        count_df <- data.frame(species = names(annotation)[x], ngenes = count)
         return(count_df)
     }))
     
     # Data frame of species and gene count based on sequences
     seq_count <- Reduce(rbind, lapply(seq_along(seq), function(x) {
         count <- length(seq[[x]])
-        count_df <- data.frame(
-            Species = names(annotation)[x],
-            Seqs = count
-        )
+        count_df <- data.frame(species = names(annotation)[x], nseqs = count)
         return(count_df)
     }))
     
     # Check if number of sequences is <= gene count (accounting for ncRNAs)
-    counts <- merge(gene_count, seq_count, by = "Species")
-    check_count <- counts$Seqs <= counts$Genes
-    idx_error <- which(check_count == FALSE)
-    if(length(idx_error) != 0) {
-        name <- counts$Species[idx_error]
-        name <- paste0(seq_along(name), ". ", name)
-        name <- paste0(name, collapse = "\n")
-        stop("Number of sequences in greater than the number of genes for:\n",
-             name)
-    } 
+    counts <- merge(gene_count, seq_count, by = "species")
+    check_count <- counts[counts$nseqs > counts$ngenes, ]
+    if(nrow(check_count) > 0) {
+        msg <- paste0(
+            "One or more species have more sequences in `seq` than ", 
+            "there are genes in `annotation`.\n",
+            "Did you remember to keep only one protein isoform per gene?\n",
+            "Problematic species:\n"
+        )
+        out <- capture.output(print(check_count, row.names = FALSE))
+        stop(paste(c(msg, out), collapse = "\n"))
+    }
+    
     return(TRUE)
 }
 
@@ -90,17 +106,18 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
 #' @return TRUE if the objects pass the check.
 #' @noRd 
 #' @importFrom GenomicRanges mcols
+#' @importFrom utils capture.output head
 #' @examples
 #' data(annotation)
 #' data(proteomes)
 #' seq <- proteomes
 #' check_gene_names(seq, annotation)
-check_gene_names <- function(seq = NULL, annotation = NULL, 
-                             gene_field = "gene_id") {
+check_gene_names <- function(
+        seq = NULL, annotation = NULL, gene_field = "gene_id"
+) {
     
     seq_names <- lapply(seq, names)
     gene_names <- lapply(annotation, function(x) {
-        
         ranges_cols <- GenomicRanges::mcols(x[x$type == "gene"])
         if(!gene_field %in% names(ranges_cols)) {
             stop("Could not find column '", gene_field, "' in GRanges.")
@@ -112,22 +129,37 @@ check_gene_names <- function(seq = NULL, annotation = NULL,
     
     # Check if names in `seq` match gene names in `annotation`
     check_names <- lapply(seq_along(seq_names), function(x) {
-        c <- seq_names[[x]] %in% gene_names[[x]]
-        c <- any(c == FALSE)
-        return(c)
+        sp <- names(seq_names)[x]
+        diff <- seq_names[[x]][!seq_names[[x]] %in% gene_names[[sp]]]
+        return(diff)
     })
+    names(check_names) <- names(seq_names)
     
-    idx_error <- which(check_names == TRUE) # TRUE means error
-    if(length(idx_error) != 0) {
-        name <- names(seq_names)[idx_error]
-        name <- paste0(seq_along(name), ". ", name)
-        name <- paste0(name, collapse = "\n")
-        stop("Sequence names in 'seq' do not match gene names in 'annotation' for:\n",
-             name)
+    # If there is at least one species with a mismatch, show species name + info
+    n_mismatch <- lengths(check_names)
+    if(any(n_mismatch > 0)) {
+        m <- names(n_mismatch[n_mismatch > 0])
+        
+        firstn <- function(x, n = 2) {
+            return(lapply(x, function(y) paste(head(y, n), collapse = ",")))
+        }
+        m_df <- data.frame(
+            species = m,
+            sample_seqs = unlist(firstn(check_names[m])),
+            sample_genes = unlist(firstn(gene_names[m]))
+        )
+        out <- capture.output(print(m_df, row.names = FALSE))
+        msg <- paste0(
+            "Sequence names in `seq` do not match gene names ", 
+            "in `annotation` for the following ", length(m), " species:\n"
+        )
+        stop(paste(c(msg, out), collapse = "\n"))
     }
+    
     return(TRUE)
 }
 
+
 #' Create a data frame of species IDs (3-5-character abbreviations)
 #' 
 #' @param species_names A character vector of names extracted from 
diff --git a/man/run_last.Rd b/man/run_last.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -35,7 +35,7 @@ BEGIN_RCPP
 END_RCPP
 }
 
-RcppExport SEXP run_testthat_tests(void *);
+RcppExport SEXP run_testthat_tests(SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
     {"_syntenet_rcpp_mcscanx_file", (DL_FUNC) &_syntenet_rcpp_mcscanx_file, 14},
diff --git a/tests/testthat/test-03_synteny_detection.R b/tests/testthat/test-03_synteny_detection.R
@@ -81,8 +81,12 @@ test_that("intraspecies_synteny() detects intraspecies synteny", {
 
     # Detect intraspecies synteny
     intrasyn <- intraspecies_synteny(blast_intra, pannotation)
+    intrasyn2 <- intraspecies_synteny(
+        blast_intra, pannotation, verbose = TRUE, is_pairwise = TRUE
+    )
 
     expect_equal(class(intrasyn), "character")
+    expect_equal(class(intrasyn2), "character")
     expect_equal(length(intrasyn), 1)
 })
 
diff --git a/vignettes/syntenet.Rmd b/vignettes/syntenet.Rmd
@@ -586,7 +586,7 @@ ggtree(angiosperm_phylogeny) +
     xlim(0, 0.3)
 ```
 
-## __syntenet__ as a synteny detection tool
+# __syntenet__ as a synteny detection tool
 
 In some cases, users do not want to infer a synteny network, but only want to
 identify syntenic regions within a single genome or between two genomes. This

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ BEGIN_RCPP`
`35`	`35`	`END_RCPP`
`36`	`36`	`}`
`37`	`37`
`38`		`-RcppExport SEXP run_testthat_tests(void *);`
	`38`	`+RcppExport SEXP run_testthat_tests(SEXP);`
`39`	`39`
`40`	`40`	`static const R_CallMethodDef CallEntries[] = {`
`41`	`41`	`{"_syntenet_rcpp_mcscanx_file", (DL_FUNC) &_syntenet_rcpp_mcscanx_file, 14},`