Skip to content

Commit 8e2ed2c

Browse files
committed
Data validation functions in process_input() now print more helpful error messages
1 parent f3a12b6 commit 8e2ed2c

File tree

9 files changed

+89
-57
lines changed

9 files changed

+89
-57
lines changed

.github/workflows/check-bioc.yml

+6-6
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252
fail-fast: false
5353
matrix:
5454
config:
55-
- { os: ubuntu-latest, r: '4.2', bioc: '3.15', cont: "bioconductor/bioconductor_docker:RELEASE_3_15", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
55+
- { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
5656
## Check https://github.com/r-lib/actions/tree/master/examples
5757
## for examples using the http-user-agent
5858
env:
@@ -119,16 +119,16 @@ jobs:
119119
uses: actions/cache@v2
120120
with:
121121
path: ${{ env.R_LIBS_USER }}
122-
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
123-
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
122+
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
123+
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
124124

125125
- name: Cache R packages on Linux
126126
if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
127127
uses: actions/cache@v2
128128
with:
129129
path: /home/runner/work/_temp/Library
130-
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
131-
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
130+
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
131+
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
132132

133133
- name: Install Linux system dependencies
134134
if: runner.os == 'Linux'
@@ -306,7 +306,7 @@ jobs:
306306
if: failure()
307307
uses: actions/upload-artifact@v2
308308
with:
309-
name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results
309+
name: ${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-results
310310
path: check
311311

312312
- uses: docker/build-push-action@v1

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ biocViews: Software,
4848
Network
4949
Encoding: UTF-8
5050
Roxygen: list(markdown = TRUE)
51-
RoxygenNote: 7.2.3
51+
RoxygenNote: 7.3.1
5252
Imports:
5353
Rcpp (>= 1.0.8),
5454
BiocParallel,

NAMESPACE

+3-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ export(cluster_network)
66
export(collapse_protein_ids)
77
export(create_species_id_table)
88
export(diamond_is_installed)
9-
export(last_is_installed)
109
export(export_sequences)
1110
export(fasta2AAStringSetlist)
1211
export(find_GS_clusters)
@@ -17,6 +16,7 @@ export(interspecies_synteny)
1716
export(intraspecies_synteny)
1817
export(iqtree_is_installed)
1918
export(iqtree_version)
19+
export(last_is_installed)
2020
export(parse_collinearity)
2121
export(phylogenomic_profile)
2222
export(plot_network)
@@ -61,7 +61,9 @@ importFrom(stats,ave)
6161
importFrom(stats,dist)
6262
importFrom(stats,hclust)
6363
importFrom(stats,reshape)
64+
importFrom(utils,capture.output)
6465
importFrom(utils,combn)
66+
importFrom(utils,head)
6567
importFrom(utils,read.csv)
6668
importFrom(utils,read.table)
6769
importFrom(utils,write.table)

R/07_microsynteny-based_phylogeny.R

+1-7
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,7 @@ infer_microsynteny_phylogeny <- function(transposed_profiles = NULL,
133133
# Run IQ-TREE
134134
if(iqtree_version() == 1) { # IQ-TREE v1
135135
iqtree_args <- c(
136-
"-s ", matrix_file,
137-
"-bb", bootr,
138-
"-alrt", alrtboot,
139-
"-nt", threads,
140-
root,
141-
"-m", model,
142-
"-st MORPH -redo"
136+
"-s ", matrix_file, "-bb", bootr, "-alrt", alrtboot, "-nt", threads, root, "-m", model, "-st MORPH -redo"
143137
)
144138
iqtree <- system2("iqtree", args = iqtree_args, stdout = stdout)
145139
} else { # IQ-TREE v2

R/utils.R

+69-37
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11

22
#' Check if the names of list of sequences and annotations match.
33
#'
4-
#'
54
#' @param seq A list of AAStringSet objects.
65
#' @param annotation A GRangesList, CompressedGRangesList, or list of
76
#' GRanges with the annotation for the sequences in \strong{seq}.
@@ -17,18 +16,37 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
1716

1817
annot_names <- names(annotation)
1918
seq_names <- names(seq)
20-
n_match <- annot_names %in% seq_names
2119

20+
# Check for differences in both sets
21+
diff_seq <- setdiff(seq_names, annot_names)
22+
diff_annot <- setdiff(annot_names, seq_names)
23+
24+
check <- TRUE
2225
if(is.null(annot_names) | is.null(seq_names)) {
2326
stop("List-like arguments 'seq' and 'annotation' must have names.")
24-
} else if(any(n_match == FALSE)) {
25-
stop("Names of list elements in 'seq' and 'annotation' must match.")
26-
} else {
27-
check <- TRUE
27+
} else if(length(diff_seq) != 0 & length(diff_annot) == 0) {
28+
stop(
29+
"The following elements in `seq` were not found in `annotation`:\n",
30+
paste0(diff_seq, collapse = "\n")
31+
)
32+
} else if(length(diff_seq) == 0 & length(diff_annot) != 0) {
33+
stop(
34+
"The following elements in `annotation` were not found in `seq`:\n",
35+
paste0(diff_annot, collapse = "\n")
36+
)
37+
} else if(length(diff_seq) != 0 & length(diff_annot) != 0) {
38+
stop(
39+
"Elements in `seq` but not in `annotation`: \n",
40+
paste0(diff_seq, collapse = "\n"),
41+
"\n\nElements in `annotation` but not in `seq`: \n",
42+
paste0(diff_annot, collapse = "\n")
43+
)
2844
}
45+
2946
return(check)
3047
}
3148

49+
3250
#' Check if the number of sequences is less than the number of genes
3351
#'
3452
#' @param seq A list of AAStringSet objects.
@@ -37,6 +55,7 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
3755
#'
3856
#' @return TRUE if the objects pass the check.
3957
#' @noRd
58+
#' @importFrom utils capture.output
4059
#' @examples
4160
#' data(proteomes)
4261
#' data(annotation)
@@ -47,34 +66,31 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
4766
# Data frame of species and gene count based on annotation
4867
gene_count <- Reduce(rbind, lapply(seq_along(annotation), function(x) {
4968
count <- length(annotation[[x]][annotation[[x]]$type == "gene"])
50-
count_df <- data.frame(
51-
Species = names(annotation)[x],
52-
Genes = count
53-
)
69+
count_df <- data.frame(species = names(annotation)[x], ngenes = count)
5470
return(count_df)
5571
}))
5672

5773
# Data frame of species and gene count based on sequences
5874
seq_count <- Reduce(rbind, lapply(seq_along(seq), function(x) {
5975
count <- length(seq[[x]])
60-
count_df <- data.frame(
61-
Species = names(annotation)[x],
62-
Seqs = count
63-
)
76+
count_df <- data.frame(species = names(annotation)[x], nseqs = count)
6477
return(count_df)
6578
}))
6679

6780
# Check if number of sequences is <= gene count (accounting for ncRNAs)
68-
counts <- merge(gene_count, seq_count, by = "Species")
69-
check_count <- counts$Seqs <= counts$Genes
70-
idx_error <- which(check_count == FALSE)
71-
if(length(idx_error) != 0) {
72-
name <- counts$Species[idx_error]
73-
name <- paste0(seq_along(name), ". ", name)
74-
name <- paste0(name, collapse = "\n")
75-
stop("Number of sequences in greater than the number of genes for:\n",
76-
name)
77-
}
81+
counts <- merge(gene_count, seq_count, by = "species")
82+
check_count <- counts[counts$nseqs > counts$ngenes, ]
83+
if(nrow(check_count) > 0) {
84+
msg <- paste0(
85+
"One or more species have more sequences in `seq` than ",
86+
"there are genes in `annotation`.\n",
87+
"Did you remember to keep only one protein isoform per gene?\n",
88+
"Problematic species:\n"
89+
)
90+
out <- capture.output(print(check_count, row.names = FALSE))
91+
stop(paste(c(msg, out), collapse = "\n"))
92+
}
93+
7894
return(TRUE)
7995
}
8096

@@ -90,17 +106,18 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
90106
#' @return TRUE if the objects pass the check.
91107
#' @noRd
92108
#' @importFrom GenomicRanges mcols
109+
#' @importFrom utils capture.output head
93110
#' @examples
94111
#' data(annotation)
95112
#' data(proteomes)
96113
#' seq <- proteomes
97114
#' check_gene_names(seq, annotation)
98-
check_gene_names <- function(seq = NULL, annotation = NULL,
99-
gene_field = "gene_id") {
115+
check_gene_names <- function(
116+
seq = NULL, annotation = NULL, gene_field = "gene_id"
117+
) {
100118

101119
seq_names <- lapply(seq, names)
102120
gene_names <- lapply(annotation, function(x) {
103-
104121
ranges_cols <- GenomicRanges::mcols(x[x$type == "gene"])
105122
if(!gene_field %in% names(ranges_cols)) {
106123
stop("Could not find column '", gene_field, "' in GRanges.")
@@ -112,22 +129,37 @@ check_gene_names <- function(seq = NULL, annotation = NULL,
112129

113130
# Check if names in `seq` match gene names in `annotation`
114131
check_names <- lapply(seq_along(seq_names), function(x) {
115-
c <- seq_names[[x]] %in% gene_names[[x]]
116-
c <- any(c == FALSE)
117-
return(c)
132+
sp <- names(seq_names)[x]
133+
diff <- seq_names[[x]][!seq_names[[x]] %in% gene_names[[sp]]]
134+
return(diff)
118135
})
136+
names(check_names) <- names(seq_names)
119137

120-
idx_error <- which(check_names == TRUE) # TRUE means error
121-
if(length(idx_error) != 0) {
122-
name <- names(seq_names)[idx_error]
123-
name <- paste0(seq_along(name), ". ", name)
124-
name <- paste0(name, collapse = "\n")
125-
stop("Sequence names in 'seq' do not match gene names in 'annotation' for:\n",
126-
name)
138+
# If there is at least one species with a mismatch, show species name + info
139+
n_mismatch <- lengths(check_names)
140+
if(any(n_mismatch > 0)) {
141+
m <- names(n_mismatch[n_mismatch > 0])
142+
143+
firstn <- function(x, n = 2) {
144+
return(lapply(x, function(y) paste(head(y, n), collapse = ",")))
145+
}
146+
m_df <- data.frame(
147+
species = m,
148+
sample_seqs = unlist(firstn(check_names[m])),
149+
sample_genes = unlist(firstn(gene_names[m]))
150+
)
151+
out <- capture.output(print(m_df, row.names = FALSE))
152+
msg <- paste0(
153+
"Sequence names in `seq` do not match gene names ",
154+
"in `annotation` for the following ", length(m), " species:\n"
155+
)
156+
stop(paste(c(msg, out), collapse = "\n"))
127157
}
158+
128159
return(TRUE)
129160
}
130161

162+
131163
#' Create a data frame of species IDs (3-5-character abbreviations)
132164
#'
133165
#' @param species_names A character vector of names extracted from

man/run_last.Rd

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/RcppExports.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ BEGIN_RCPP
3535
END_RCPP
3636
}
3737

38-
RcppExport SEXP run_testthat_tests(void *);
38+
RcppExport SEXP run_testthat_tests(SEXP);
3939

4040
static const R_CallMethodDef CallEntries[] = {
4141
{"_syntenet_rcpp_mcscanx_file", (DL_FUNC) &_syntenet_rcpp_mcscanx_file, 14},

tests/testthat/test-03_synteny_detection.R

+4
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,12 @@ test_that("intraspecies_synteny() detects intraspecies synteny", {
8181

8282
# Detect intraspecies synteny
8383
intrasyn <- intraspecies_synteny(blast_intra, pannotation)
84+
intrasyn2 <- intraspecies_synteny(
85+
blast_intra, pannotation, verbose = TRUE, is_pairwise = TRUE
86+
)
8487

8588
expect_equal(class(intrasyn), "character")
89+
expect_equal(class(intrasyn2), "character")
8690
expect_equal(length(intrasyn), 1)
8791
})
8892

vignettes/syntenet.Rmd

+1-1
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ ggtree(angiosperm_phylogeny) +
586586
xlim(0, 0.3)
587587
```
588588

589-
## __syntenet__ as a synteny detection tool
589+
# __syntenet__ as a synteny detection tool
590590

591591
In some cases, users do not want to infer a synteny network, but only want to
592592
identify syntenic regions within a single genome or between two genomes. This

0 commit comments

Comments
 (0)