Skip to content

Commit bf8abf0

Browse files
committed
Add multi-context seeds
1 parent 7d7b428 commit bf8abf0

File tree

12 files changed

+115
-25
lines changed

12 files changed

+115
-25
lines changed

src/arguments.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ struct SeedingArguments {
2626
"results with non default values.", {'s'}}
2727
, bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)"
2828
"[determined from reference size]", {'b'}}
29+
, digest{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"digest"}}
2930
{
3031
}
3132
args::ArgumentParser& parser;
32-
args::ValueFlag<int> r, m, k, l, u, c, s, bits;
33+
args::ValueFlag<int> r, m, k, l, u, c, s, bits, digest;
3334
};
3435

3536
#endif

src/cmdline.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
117117
if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; }
118118
if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; }
119119
if (seeding.bits) { opt.bits = args::get(seeding.bits); }
120+
if (seeding.digest) { opt.digest = args::get(seeding.digest); }
120121

121122
// Alignment
122123
// if (n) { n = args::get(n); }

src/cmdline.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct CommandLineOptions {
4545
int u { 7 };
4646
int s { 16 };
4747
int c { 8 };
48+
int digest {24};
4849

4950
// Alignment
5051
int A { 2 };

src/dumpstrobes.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) {
101101
}
102102

103103
// Seeding
104-
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7};
104+
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, digest{8};
105105
int max_seed_len{};
106106

107107
bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false};
@@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) {
125125
l_set ? l : IndexParameters::DEFAULT,
126126
u_set ? u : IndexParameters::DEFAULT,
127127
c_set ? c : IndexParameters::DEFAULT,
128-
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT
128+
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT,
129+
digest ? digest: IndexParameters::DEFAULT
129130
);
130131

131132
logger.info() << index_parameters << '\n';

src/index.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ void StrobemerIndex::populate(float f, size_t n_threads) {
149149
}
150150
stats.tot_strobemer_count = total_randstrobes;
151151

152+
logger.info() << "Digest parameter is : " << parameters.randstrobe.digest;
153+
152154
logger.debug() << " Total number of randstrobes: " << total_randstrobes << '\n';
153155
uint64_t memory_bytes = references.total_length() + sizeof(RefRandstrobe) * total_randstrobes + sizeof(bucket_index_t) * (1u << bits);
154156
logger.debug() << " Estimated total memory usage: " << memory_bytes / 1E9 << " GB\n";

src/index.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "refs.hpp"
1818
#include "randstrobes.hpp"
1919
#include "indexparameters.hpp"
20+
#include "logger.hpp"
2021

2122

2223
struct IndexCreationStatistics {
@@ -38,6 +39,7 @@ struct StrobemerIndex {
3839
using bucket_index_t = uint64_t;
3940
StrobemerIndex(const References& references, const IndexParameters& parameters, int bits=-1)
4041
: filter_cutoff(0)
42+
, partial_filter_cutoff(2000) //fixme initialize somewhere else
4143
, parameters(parameters)
4244
, references(references)
4345
, bits(bits == -1 ? pick_bits(references.total_length()) : bits)
@@ -47,6 +49,7 @@ struct StrobemerIndex {
4749
}
4850
}
4951
unsigned int filter_cutoff;
52+
unsigned int partial_filter_cutoff;
5053
mutable IndexCreationStatistics stats;
5154

5255
void write(const std::string& filename) const;
@@ -80,6 +83,37 @@ struct StrobemerIndex {
8083
return end();
8184
}
8285

86+
//Returns the first entry that matches the first strobe subhash (if using multi-context seeds)
87+
size_t partial_find(randstrobe_hash_t key) const {
88+
uint strobe2_digest = parameters.randstrobe.digest;
89+
randstrobe_hash_t key_prefix = key >> strobe2_digest;
90+
91+
constexpr int MAX_LINEAR_SEARCH = 4;
92+
const unsigned int top_N = key >> (64 - bits);
93+
bucket_index_t position_start = randstrobe_start_indices[top_N];
94+
bucket_index_t position_end = randstrobe_start_indices[top_N + 1];
95+
if (position_start == position_end) {
96+
return end();
97+
}
98+
99+
if (position_end - position_start < MAX_LINEAR_SEARCH) {
100+
for ( ; position_start < position_end; ++position_start) {
101+
if (randstrobes[position_start].hash >> strobe2_digest == key_prefix) return position_start;
102+
if (randstrobes[position_start].hash >> strobe2_digest > key_prefix) return end();
103+
}
104+
return end();
105+
}
106+
auto cmp = [&strobe2_digest](const RefRandstrobe lhs, const RefRandstrobe rhs) {
107+
return (lhs.hash >> strobe2_digest) < (rhs.hash >> strobe2_digest); };
108+
109+
auto pos = std::lower_bound(randstrobes.begin() + position_start,
110+
randstrobes.begin() + position_end,
111+
RefRandstrobe{key, 0, 0},
112+
cmp);
113+
if (pos->hash == key) return pos - randstrobes.begin();
114+
return end();
115+
}
116+
83117
randstrobe_hash_t get_hash(bucket_index_t position) const {
84118
if (position < randstrobes.size()) {
85119
return randstrobes[position].hash;
@@ -92,6 +126,11 @@ struct StrobemerIndex {
92126
return get_hash(position) == get_hash(position + filter_cutoff);
93127
}
94128

129+
bool is_partial_filtered(bucket_index_t position) const {
130+
uint shift = parameters.randstrobe.digest;
131+
return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift);
132+
}
133+
95134
unsigned int get_strobe1_position(bucket_index_t position) const {
96135
return randstrobes[position].position;
97136
}

src/indexparameters.cpp

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const {
1818
&& this->q == other.q
1919
&& this->max_dist == other.max_dist
2020
&& this->w_min == other.w_min
21-
&& this->w_max == other.w_max;
21+
&& this->w_max == other.w_max
22+
&& this->digest == other.digest;
2223
}
2324

2425
/* Pre-defined index parameters that work well for a certain
@@ -30,24 +31,25 @@ struct Profile {
3031
int s_offset;
3132
int l;
3233
int u;
34+
int digest;
3335
};
3436

3537
static auto max{std::numeric_limits<int>::max()};
3638

3739
static std::vector<Profile> profiles = {
38-
Profile{ 50, 90, 18, -4, -2, 1},
39-
Profile{100, 110, 20, -4, -2, 2},
40-
Profile{125, 135, 20, -4, -1, 4},
41-
Profile{150, 175, 20, -4, 1, 7},
42-
Profile{250, 375, 22, -4, 2, 12},
43-
Profile{400, max, 23, -6, 2, 12},
40+
Profile{ 50, 90, 18, -4, -2, 1, 24},
41+
Profile{100, 110, 20, -4, -2, 2, 24},
42+
Profile{125, 135, 20, -4, -1, 4, 24},
43+
Profile{150, 175, 20, -4, 1, 7, 24},
44+
Profile{250, 375, 22, -4, 2, 12, 24},
45+
Profile{400, max, 23, -6, 2, 12, 24},
4446
};
4547

4648
/* Create an IndexParameters instance based on a given read length.
4749
* k, s, l, u, c and max_seed_len can be used to override determined parameters
4850
* by setting them to a value other than IndexParameters::DEFAULT.
4951
*/
50-
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) {
52+
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int digest) {
5153
const int default_c = 8;
5254
size_t canonical_read_length = 50;
5355
for (const auto& p : profiles) {
@@ -64,6 +66,9 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
6466
if (u == DEFAULT) {
6567
u = p.u;
6668
}
69+
if (digest == DEFAULT) {
70+
digest = p.digest;
71+
}
6772
canonical_read_length = p.canonical_read_length;
6873
break;
6974
}
@@ -78,7 +83,7 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
7883
}
7984
int q = std::pow(2, c == DEFAULT ? default_c : c) - 1;
8085

81-
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
86+
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest);
8287
}
8388

8489
void IndexParameters::write(std::ostream& os) const {
@@ -89,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const {
8994
write_int_to_ostream(os, randstrobe.u);
9095
write_int_to_ostream(os, randstrobe.q);
9196
write_int_to_ostream(os, randstrobe.max_dist);
97+
write_int_to_ostream(os, randstrobe.digest);
9298
}
9399

94100
IndexParameters IndexParameters::read(std::istream& is) {
@@ -99,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) {
99105
int u = read_int_from_istream(is);
100106
int q = read_int_from_istream(is);
101107
int max_dist = read_int_from_istream(is);
102-
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
108+
int digest = read_int_from_istream(is);
109+
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest);
103110
}
104111

105112
bool IndexParameters::operator==(const IndexParameters& other) const {
@@ -119,7 +126,7 @@ std::string IndexParameters::filename_extension() const {
119126
// nothing was overridden
120127
sstream << ".r" << canonical_read_length;
121128
}
122-
sstream << ".sti";
129+
sstream << ".stimc";
123130
return sstream.str();
124131
}
125132

src/indexparameters.hpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,16 @@ struct RandstrobeParameters {
4343
const int max_dist;
4444
const unsigned w_min;
4545
const unsigned w_max;
46+
const unsigned digest;
4647

47-
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max)
48+
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned digest)
4849
: l(l)
4950
, u(u)
5051
, q(q)
5152
, max_dist(max_dist)
5253
, w_min(w_min)
5354
, w_max(w_max)
55+
, digest(digest)
5456
{
5557
verify();
5658
}
@@ -65,6 +67,9 @@ struct RandstrobeParameters {
6567
if (w_min > w_max) {
6668
throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)");
6769
}
70+
if (digest > 63) {
71+
throw BadParameter("digest size is larger than 63");
72+
}
6873
}
6974
};
7075

@@ -77,15 +82,15 @@ class IndexParameters {
7782

7883
static const int DEFAULT = std::numeric_limits<int>::min();
7984

80-
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist)
85+
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int digest)
8186
: canonical_read_length(canonical_read_length)
8287
, syncmer(k, s)
83-
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u)
88+
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, digest)
8489
{
8590
}
8691

8792
static IndexParameters from_read_length(
88-
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT
93+
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int digest = DEFAULT
8994
);
9095
static IndexParameters read(std::istream& os);
9196
std::string filename_extension() const;

src/main.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@ int run_strobealign(int argc, char **argv) {
167167
opt.l_set ? opt.l : IndexParameters::DEFAULT,
168168
opt.u_set ? opt.u : IndexParameters::DEFAULT,
169169
opt.c_set ? opt.c : IndexParameters::DEFAULT,
170-
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT
170+
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT,
171+
opt.digest ? opt.digest : IndexParameters::DEFAULT
171172
);
172173
logger.debug() << index_parameters << '\n';
173174
AlignmentParameters aln_params;

src/nam.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "nam.hpp"
2+
static Logger& logger = Logger::get();
23

34
namespace {
45

@@ -14,12 +15,14 @@ inline void add_to_hits_per_ref(
1415
int query_start,
1516
int query_end,
1617
const StrobemerIndex& index,
17-
size_t position
18+
size_t position,
19+
bool is_partial
1820
) {
1921
int min_diff = std::numeric_limits<int>::max();
2022
for (const auto hash = index.get_hash(position); index.get_hash(position) == hash; ++position) {
2123
int ref_start = index.get_strobe1_position(position);
22-
int ref_end = ref_start + index.strobe2_offset(position) + index.k();
24+
uint strobe2_position = is_partial ? 0 : index.strobe2_offset(position);
25+
int ref_end = ref_start + strobe2_position + index.k();
2326
int diff = std::abs((query_end - query_start) - (ref_end - ref_start));
2427
if (diff <= min_diff) {
2528
hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end});
@@ -171,7 +174,17 @@ std::pair<float, std::vector<Nam>> find_nams(
171174
continue;
172175
}
173176
nr_good_hits++;
174-
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
177+
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position, false);
178+
} else {
179+
size_t partial_pos = index.partial_find(q.hash);
180+
if (partial_pos != index.end()) {
181+
total_hits++;
182+
if (index.is_partial_filtered(position)) {
183+
continue;
184+
}
185+
nr_good_hits++;
186+
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, partial_pos, true);
187+
}
175188
}
176189
}
177190
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
@@ -231,12 +244,14 @@ std::vector<Nam> find_nams_rescue(
231244
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {
232245
break;
233246
}
234-
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
247+
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position, false);
235248
cnt++;
236249
}
237250
is_revcomp++;
238251
}
239252

253+
254+
240255
return merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true);
241256
}
242257

0 commit comments

Comments
 (0)