Skip to content

Commit fbc9328

Browse files
authored
Merge pull request #31 from unipept/feature/suffix-to-protein-optimization
suffix to protein optimization
2 parents 46f9d26 + 11d254f commit fbc9328

File tree

8 files changed

+169
-22
lines changed

8 files changed

+169
-22
lines changed

Cargo.lock

Lines changed: 58 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bitarray/src/binary.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ mod tests {
125125

126126
impl Read for ErrorInput {
127127
fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
128-
Err(std::io::Error::new(std::io::ErrorKind::Other, "read error"))
128+
Err(std::io::Error::other("read error"))
129129
}
130130
}
131131

sa-compression/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ mod tests {
9898
impl Write for FailingWriter {
9999
fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
100100
if self.valid_write_count == 0 {
101-
return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
101+
return Err(std::io::Error::other("Write failed"));
102102
}
103103

104104
self.valid_write_count -= 1;
@@ -118,7 +118,7 @@ mod tests {
118118
impl Read for FailingReader {
119119
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
120120
if self.valid_read_count == 0 {
121-
return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
121+
return Err(std::io::Error::other("Read failed"));
122122
}
123123

124124
self.valid_read_count -= 1;

sa-index/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@ sa-mappings = { path = "../sa-mappings" }
1717
text-compression = { path = "../text-compression" }
1818
bitarray = { path = "../bitarray" }
1919
serde_json = "1.0.116"
20+
bitvec = "1"
21+
succinct = "0.5.2"

sa-index/src/binary.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ mod tests {
196196
impl Write for FailingWriter {
197197
fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
198198
if self.valid_write_count == 0 {
199-
return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
199+
return Err(std::io::Error::other("Write failed"));
200200
}
201201

202202
self.valid_write_count -= 1;
@@ -216,7 +216,7 @@ mod tests {
216216
impl Read for FailingReader {
217217
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
218218
if self.valid_read_count == 0 {
219-
return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
219+
return Err(std::io::Error::other("Read failed"));
220220
}
221221

222222
self.valid_read_count -= 1;

sa-index/src/sa_searcher.rs

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use text_compression::ProteinTextSlice;
66
use crate::{
77
Nullable, SuffixArray,
88
sa_searcher::BoundSearch::{Maximum, Minimum},
9-
suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}
9+
suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, BitVecSuffixToProtein, SuffixToProteinIndex}
1010
};
1111

1212
/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
@@ -90,6 +90,24 @@ impl Deref for SparseSearcher {
9090
}
9191
}
9292

93+
/// Newtype wrapper around a [`Searcher`].
///
/// `BitVecSearcher::new` configures the wrapped searcher with a
/// `BitVecSuffixToProtein` as its suffix-to-protein mapping.
pub struct BitVecSearcher(Searcher);
94+
95+
impl BitVecSearcher {
96+
/// Creates a searcher over the suffix array `sa` and the protein set `proteins`.
///
/// The suffix-to-protein mapping is built from `proteins.text` with
/// `BitVecSuffixToProtein::new` and handed to `Searcher::new` as a boxed value.
///
/// NOTE(review): the internal representation of `BitVecSuffixToProtein` is not
/// visible from here; see `suffix_to_protein_index` for its details.
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
97+
// Build the mapping first: it only borrows `proteins.text`, whereas
// `Searcher::new` below takes `proteins` by value.
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
98+
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
99+
Self(searcher)
100+
}
101+
}
102+
103+
/// Dereferences a `BitVecSearcher` to the `Searcher` it wraps, so the wrapped
/// searcher's `&self` methods can be called directly on the wrapper.
impl Deref for BitVecSearcher {
104+
/// The wrapped searcher type.
type Target = Searcher;
105+
106+
/// Returns a shared reference to the wrapped `Searcher` (tuple field `0`).
fn deref(&self) -> &Self::Target {
107+
&self.0
108+
}
109+
}
110+
93111
pub struct DenseSearcher(Searcher);
94112

95113
impl DenseSearcher {
@@ -495,9 +513,7 @@ mod tests {
495513
use text_compression::ProteinText;
496514

497515
use crate::{
498-
SuffixArray,
499-
sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher},
500-
suffix_to_protein_index::SparseSuffixToProtein
516+
sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, suffix_to_protein_index::{BitVecSuffixToProtein, DenseSuffixToProtein, SparseSuffixToProtein}, SuffixArray
501517
};
502518

503519
#[test]
@@ -556,7 +572,7 @@ mod tests {
556572
let proteins = get_example_proteins();
557573
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
558574

559-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
575+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
560576
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
561577

562578
// search bounds 'A'
@@ -589,12 +605,29 @@ mod tests {
589605
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
590606
}
591607

608+
// Checks `Searcher::search_matching_suffixes` when the searcher is configured
// with a `DenseSuffixToProtein` mapping over the example proteins.
#[test]
609+
fn test_search_dense() {
610+
let proteins = get_example_proteins();
611+
// NOTE(review): the second argument (3) is presumably the sparseness factor of
// the suffix array, i.e. only every third suffix is stored — confirm against
// `SuffixArray::Original`.
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
612+
613+
let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text);
614+
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
615+
616+
// search suffix 'VAA'; the expected result is the single suffix index 7
// NOTE(review): the meaning of the `usize::MAX, false, false` arguments is not
// visible here — verify against `search_matching_suffixes`.
617+
let found_suffixes = searcher.search_matching_suffixes(b"VAA", usize::MAX, false, false);
618+
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7]));
619+
620+
// search suffix 'AC'; the expected result is the suffix indices 5 and 11
621+
let found_suffixes = searcher.search_matching_suffixes(b"AC", usize::MAX, false, false);
622+
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
623+
}
624+
592625
#[test]
593626
fn test_il_equality() {
594627
let proteins = get_example_proteins();
595628
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
596629

597-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
630+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
598631
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
599632

600633
let bounds_res = searcher.search_bounds(b"I");
@@ -638,7 +671,7 @@ mod tests {
638671
};
639672

640673
let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
641-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
674+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
642675
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
643676

644677
// search bounds 'IM' with equal I and L
@@ -661,7 +694,7 @@ mod tests {
661694
};
662695

663696
let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
664-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
697+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
665698
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
666699

667700
let found_suffixes = searcher.search_matching_suffixes(b"I", usize::MAX, true, false);
@@ -683,7 +716,7 @@ mod tests {
683716
};
684717

685718
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
686-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
719+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
687720
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
688721

689722
let found_suffixes = searcher.search_matching_suffixes(b"II", usize::MAX, true, false);
@@ -705,7 +738,7 @@ mod tests {
705738
};
706739

707740
let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
708-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
741+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
709742
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
710743

711744
// search all places where II is in the string IIIILL, but with a sparse SA
@@ -729,7 +762,7 @@ mod tests {
729762
};
730763

731764
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
732-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
765+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
733766
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
734767

735768
// search bounds 'IM' with equal I and L
@@ -752,7 +785,7 @@ mod tests {
752785
};
753786

754787
let sparse_sa = SuffixArray::Original(vec![13, 3, 12, 11, 1, 4, 2, 5, 9, 8, 6, 10, 0, 7], 1);
755-
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
788+
let suffix_index_to_protein = BitVecSuffixToProtein::new(&proteins.text);
756789
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
757790

758791
let found_suffixes_1 = searcher.search_matching_suffixes(b"PAA", usize::MAX, false, true);

0 commit comments

Comments
 (0)