1
1
2
2
# ' Check if the names of list of sequences and annotations match.
3
3
# '
4
- # '
5
4
# ' @param seq A list of AAStringSet objects.
6
5
# ' @param annotation A GRangesList, CompressedGRangesList, or list of
7
6
# ' GRanges with the annotation for the sequences in \strong{seq}.
@@ -17,18 +16,37 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
17
16
18
17
annot_names <- names(annotation )
19
18
seq_names <- names(seq )
20
- n_match <- annot_names %in% seq_names
21
19
20
+ # Check for differences in both sets
21
+ diff_seq <- setdiff(seq_names , annot_names )
22
+ diff_annot <- setdiff(annot_names , seq_names )
23
+
24
+ check <- TRUE
22
25
if (is.null(annot_names ) | is.null(seq_names )) {
23
26
stop(" List-like arguments 'seq' and 'annotation' must have names." )
24
- } else if (any(n_match == FALSE )) {
25
- stop(" Names of list elements in 'seq' and 'annotation' must match." )
26
- } else {
27
- check <- TRUE
27
+ } else if (length(diff_seq ) != 0 & length(diff_annot ) == 0 ) {
28
+ stop(
29
+ " The following elements in `seq` were not found in `annotation`:\n " ,
30
+ paste0(diff_seq , collapse = " \n " )
31
+ )
32
+ } else if (length(diff_seq ) == 0 & length(diff_annot ) != 0 ) {
33
+ stop(
34
+ " The following elements in `annotation` were not found in `seq:`\n " ,
35
+ paste0(diff_annot , collapse = " \n " )
36
+ )
37
+ } else if (length(diff_seq ) != 0 & length(diff_annot ) != 0 ) {
38
+ stop(
39
+ " Element in `seq` but not in `annotation`: \n " ,
40
+ paste0(diff_seq , collapse = " \n " ),
41
+ " \n\n Elements in `annotation` but not in `seq`: \n " ,
42
+ paste0(diff_annot , collapse = " \n " )
43
+ )
28
44
}
45
+
29
46
return (check )
30
47
}
31
48
49
+
32
50
# ' Check if the number of sequences is less than the number of genes
33
51
# '
34
52
# ' @param seq A list of AAStringSet objects.
@@ -37,6 +55,7 @@ check_list_names <- function(seq = NULL, annotation = NULL) {
37
55
# '
38
56
# ' @return TRUE if the objects pass the check.
39
57
# ' @noRd
58
+ # ' @importFrom utils capture.output
40
59
# ' @examples
41
60
# ' data(proteomes)
42
61
# ' data(annotation)
@@ -47,34 +66,31 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
47
66
# Data frame of species and gene count based on annotation
48
67
gene_count <- Reduce(rbind , lapply(seq_along(annotation ), function (x ) {
49
68
count <- length(annotation [[x ]][annotation [[x ]]$ type == " gene" ])
50
- count_df <- data.frame (
51
- Species = names(annotation )[x ],
52
- Genes = count
53
- )
69
+ count_df <- data.frame (species = names(annotation )[x ], ngenes = count )
54
70
return (count_df )
55
71
}))
56
72
57
73
# Data frame of species and gene count based on sequences
58
74
seq_count <- Reduce(rbind , lapply(seq_along(seq ), function (x ) {
59
75
count <- length(seq [[x ]])
60
- count_df <- data.frame (
61
- Species = names(annotation )[x ],
62
- Seqs = count
63
- )
76
+ count_df <- data.frame (species = names(annotation )[x ], nseqs = count )
64
77
return (count_df )
65
78
}))
66
79
67
80
# Check if number of sequences is <= gene count (accounting for ncRNAs)
68
- counts <- merge(gene_count , seq_count , by = " Species" )
69
- check_count <- counts $ Seqs < = counts $ Genes
70
- idx_error <- which(check_count == FALSE )
71
- if (length(idx_error ) != 0 ) {
72
- name <- counts $ Species [idx_error ]
73
- name <- paste0(seq_along(name ), " . " , name )
74
- name <- paste0(name , collapse = " \n " )
75
- stop(" Number of sequences in greater than the number of genes for:\n " ,
76
- name )
77
- }
81
+ counts <- merge(gene_count , seq_count , by = " species" )
82
+ check_count <- counts [counts $ nseqs > counts $ ngenes , ]
83
+ if (nrow(check_count ) > 0 ) {
84
+ msg <- paste0(
85
+ " One or more species have more sequences in `seq` than " ,
86
+ " there are genes in `annotation`.\n " ,
87
+ " Did you remember to keep only one protein isoform per gene?\n " ,
88
+ " Problematic species:\n "
89
+ )
90
+ out <- capture.output(print(check_count , row.names = FALSE ))
91
+ stop(paste(c(msg , out ), collapse = " \n " ))
92
+ }
93
+
78
94
return (TRUE )
79
95
}
80
96
@@ -90,17 +106,18 @@ check_ngenes <- function(seq = NULL, annotation = NULL) {
90
106
# ' @return TRUE if the objects pass the check.
91
107
# ' @noRd
92
108
# ' @importFrom GenomicRanges mcols
109
+ # ' @importFrom utils capture.output head
93
110
# ' @examples
94
111
# ' data(annotation)
95
112
# ' data(proteomes)
96
113
# ' seq <- proteomes
97
114
# ' check_gene_names(seq, annotation)
98
- check_gene_names <- function (seq = NULL , annotation = NULL ,
99
- gene_field = " gene_id" ) {
115
+ check_gene_names <- function (
116
+ seq = NULL , annotation = NULL , gene_field = " gene_id"
117
+ ) {
100
118
101
119
seq_names <- lapply(seq , names )
102
120
gene_names <- lapply(annotation , function (x ) {
103
-
104
121
ranges_cols <- GenomicRanges :: mcols(x [x $ type == " gene" ])
105
122
if (! gene_field %in% names(ranges_cols )) {
106
123
stop(" Could not find column '" , gene_field , " ' in GRanges." )
@@ -112,22 +129,37 @@ check_gene_names <- function(seq = NULL, annotation = NULL,
112
129
113
130
# Check if names in `seq` match gene names in `annotation`
114
131
check_names <- lapply(seq_along(seq_names ), function (x ) {
115
- c <- seq_names [[ x ]] %in% gene_names [[ x ] ]
116
- c <- any( c == FALSE )
117
- return (c )
132
+ sp <- names( seq_names )[ x ]
133
+ diff <- seq_names [[ x ]][ ! seq_names [[ x ]] %in% gene_names [[ sp ]]]
134
+ return (diff )
118
135
})
136
+ names(check_names ) <- names(seq_names )
119
137
120
- idx_error <- which(check_names == TRUE ) # TRUE means error
121
- if (length(idx_error ) != 0 ) {
122
- name <- names(seq_names )[idx_error ]
123
- name <- paste0(seq_along(name ), " . " , name )
124
- name <- paste0(name , collapse = " \n " )
125
- stop(" Sequence names in 'seq' do not match gene names in 'annotation' for:\n " ,
126
- name )
138
+ # If there at least one species with a mismatch, show species name + info
139
+ n_mismatch <- lengths(check_names )
140
+ if (any(n_mismatch > 0 )) {
141
+ m <- names(n_mismatch [n_mismatch > 0 ])
142
+
143
+ firstn <- function (x , n = 2 ) {
144
+ return (lapply(x , function (y ) paste(head(y , n ), collapse = " ," )))
145
+ }
146
+ m_df <- data.frame (
147
+ species = m ,
148
+ sample_seqs = unlist(firstn(check_names [m ])),
149
+ sample_genes = unlist(firstn(gene_names [m ]))
150
+ )
151
+ out <- capture.output(print(m_df , row.names = FALSE ))
152
+ msg <- paste0(
153
+ " Sequence names in `seq` do not match gene names " ,
154
+ " in `annotation` for the following " , length(m ), " species:\n "
155
+ )
156
+ stop(paste(c(msg , out ), collapse = " \n " ))
127
157
}
158
+
128
159
return (TRUE )
129
160
}
130
161
162
+
131
163
# ' Create a data frame of species IDs (3-5-character abbreviations)
132
164
# '
133
165
# ' @param species_names A character vector of names extracted from
0 commit comments