Skip to content

Commit ea5d6e5

Browse files
committed
better rgfa support in call
1 parent f29425d commit ea5d6e5

File tree

2 files changed

+118
-28
lines changed

2 files changed

+118
-28
lines changed

src/graph_caller.cpp

Lines changed: 101 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "graph_caller.hpp"
22
#include "algorithms/expand_context.hpp"
33
#include "annotation.hpp"
4+
#include "rgfa.hpp"
45

56
//#define debug
67

@@ -18,42 +19,52 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re
1819

1920
// Used to recurse on children of parents that can't be called
2021
size_t thread_count = get_thread_count();
21-
vector<vector<const Snarl*>> snarl_queue(thread_count);
22+
vector<vector<pair<const Snarl*, int>>> snarl_queue(thread_count);
2223

2324
// Run the snarl caller on a snarl, and queue up the children if it fails
24-
auto process_snarl = [&](const Snarl* snarl) {
25+
auto process_snarl = [&](const Snarl* snarl, int ploidy_override) {
2526

2627
if (!snarl_manager.is_trivial(snarl, graph)) {
2728

2829
#ifdef debug
2930
cerr << "GraphCaller running call_snarl on " << pb2json(*snarl) << endl;
3031
#endif
31-
32-
bool was_called = call_snarl(*snarl);
33-
if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) {
34-
const vector<const Snarl*>& children = snarl_manager.children_of(snarl);
35-
vector<const Snarl*>& thread_queue = snarl_queue[omp_get_thread_num()];
36-
thread_queue.insert(thread_queue.end(), children.begin(), children.end());
32+
const vector<const Snarl*>& children = snarl_manager.children_of(snarl);
33+
vector<int> child_ploidies(children.size(), -1);
34+
bool was_called = call_snarl(*snarl, ploidy_override, &child_ploidies);
35+
if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) {
36+
vector<pair<const Snarl*, int>>& thread_queue = snarl_queue[omp_get_thread_num()];
37+
for (int64_t i = 0; i < children.size(); ++i) {
38+
thread_queue.push_back(make_pair(children[i], child_ploidies[i]));
39+
}
3740
}
3841
}
3942
};
4043

4144
// Start with the top level snarls
42-
snarl_manager.for_each_top_level_snarl_parallel(process_snarl);
45+
// Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl()
46+
vector<const Snarl*> top_level_snarls;
47+
snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) {
48+
top_level_snarls.push_back(snarl);
49+
});
50+
#pragma omp parallel for schedule(dynamic, 1)
51+
for (int64_t i = 0; i < top_level_snarls.size(); ++i) {
52+
process_snarl(top_level_snarls[i], -1);
53+
}
4354

4455
// Then recurse on any children the snarl caller failed to handle
4556
while (!std::all_of(snarl_queue.begin(), snarl_queue.end(),
46-
[](const vector<const Snarl*>& snarl_vec) {return snarl_vec.empty();})) {
47-
vector<const Snarl*> cur_queue;
48-
for (vector<const Snarl*>& thread_queue : snarl_queue) {
57+
[](const vector<pair<const Snarl*, int>>& snarl_vec) {return snarl_vec.empty();})) {
58+
vector<pair<const Snarl*, int>> cur_queue;
59+
for (vector<pair<const Snarl*, int>>& thread_queue : snarl_queue) {
4960
cur_queue.reserve(cur_queue.size() + thread_queue.size());
5061
std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue));
5162
thread_queue.clear();
5263
}
5364

5465
#pragma omp parallel for schedule(dynamic, 1)
5566
for (int i = 0; i < cur_queue.size(); ++i) {
56-
process_snarl(cur_queue[i]);
67+
process_snarl(cur_queue[i].first, cur_queue[i].second);
5768
}
5869
}
5970

@@ -175,8 +186,48 @@ vector<Chain> GraphCaller::break_chain(const HandleGraph& graph, const Chain& ch
175186

176187
return chain_frags;
177188
}
189+
190+
191+
void GraphCaller::resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector<SnarlTraversal>& travs,
192+
const vector<int>& genotype, vector<int>& child_ploidies) {
193+
const vector<const Snarl*>& children = snarl_manager.children_of(snarl);
194+
195+
child_ploidies.resize(children.size());
178196

179-
VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false)
197+
if (!children.empty()) {
198+
// index the nodes in the traversals
199+
vector<unordered_set<handle_t>> trav_indexes(genotype.size());
200+
for (int64_t i = 0; i < genotype.size(); ++i) {
201+
const SnarlTraversal& trav = travs[genotype[i]];
202+
unordered_set<handle_t>& trav_idx = trav_indexes[i];
203+
if (i > 0 && genotype[i] == genotype[i-1]) {
204+
trav_idx = trav_indexes[i-1];
205+
} else {
206+
for (int j = 0; j < trav.visit_size(); ++j) {
207+
trav_idx.insert(graph.get_handle(trav.visit(j).node_id(), trav.visit(j).backward()));
208+
}
209+
}
210+
}
211+
212+
// for every child snarl, count the number of traversals that spans it -- this will be its ploidy
213+
for (int64_t i = 0; i < children.size(); ++i) {
214+
const Snarl* child_snarl = children[i];
215+
child_ploidies[i] = 0;
216+
handle_t child_start = graph.get_handle(child_snarl->start().node_id(), child_snarl->start().backward());
217+
handle_t child_end = graph.get_handle(child_snarl->end().node_id(), child_snarl->end().backward());
218+
for (int64_t j = 0; j < trav_indexes.size(); ++j) {
219+
unordered_set<handle_t>& trav_idx = trav_indexes[j];
220+
if ((trav_idx.count(child_start) && trav_idx.count(child_end)) ||
221+
(trav_idx.count(graph.flip(child_start)) && trav_idx.count(graph.flip(child_end)))) {
222+
++child_ploidies[i];
223+
}
224+
}
225+
}
226+
}
227+
}
228+
229+
230+
VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false), write_full_names(false)
180231
{
181232
output_variants.resize(get_thread_count());
182233
}
@@ -491,11 +542,11 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
491542

492543
// resolve subpath naming
493544
subrange_t subrange;
494-
string basepath_name = Paths::strip_subrange(ref_path_name, &subrange);
545+
string basepath_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(ref_path_name), &subrange);
495546
size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
496547
// in VCF we usually just want a contig
497548
string contig_name = PathMetadata::parse_locus_name(basepath_name);
498-
if (contig_name != PathMetadata::NO_LOCUS_NAME) {
549+
if (contig_name != PathMetadata::NO_LOCUS_NAME && !this->write_full_names) {
499550
basepath_name = contig_name;
500551
}
501552
// fill out the rest of the variant
@@ -766,6 +817,20 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, function<void(cons
766817
}
767818
}
768819

820+
void VCFOutputCaller::toggle_full_names_from_paths(const vector<string>& ref_paths) {
821+
set<string> samples;
822+
set<int> haplotypes;
823+
for (const string& path_name : ref_paths) {
824+
samples.insert(PathMetadata::parse_sample_name(path_name));
825+
haplotypes.insert(PathMetadata::parse_haplotype(path_name));
826+
if (samples.size() > 1 || haplotypes.size() > 1) {
827+
this->write_full_names = true;
828+
return;
829+
}
830+
}
831+
this->write_full_names = false;
832+
}
833+
769834
GAFOutputCaller::GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector<string>& ref_paths,
770835
size_t trav_padding) :
771836
emitter(emitter),
@@ -1054,7 +1119,7 @@ VCFGenotyper::~VCFGenotyper() {
10541119

10551120
}
10561121

1057-
bool VCFGenotyper::call_snarl(const Snarl& snarl) {
1122+
bool VCFGenotyper::call_snarl(const Snarl& snarl, int ploidy_override, vector<int>* out_child_ploidies) {
10581123

10591124
// could be that our graph is a subgraph of the graph the snarls were computed from
10601125
// so bypass snarls we can't process
@@ -1296,6 +1361,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph,
12961361
// our graph is not in vg format. we will make graphs for each site as needed and work with those
12971362
traversal_finder = nullptr;
12981363
}
1364+
1365+
this->toggle_full_names_from_paths(ref_paths);
12991366
}
13001367

13011368
LegacyCaller::~LegacyCaller() {
@@ -1305,7 +1372,7 @@ LegacyCaller::~LegacyCaller() {
13051372
}
13061373
}
13071374

1308-
bool LegacyCaller::call_snarl(const Snarl& snarl) {
1375+
bool LegacyCaller::call_snarl(const Snarl& snarl, int ploidy_override, vector<int>* out_child_ploidies) {
13091376

13101377
// if we can't handle the snarl, then the GraphCaller framework will recurse on its children
13111378
if (!is_traversable(snarl)) {
@@ -1630,20 +1697,26 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph,
16301697
ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2;
16311698
}
16321699

1700+
this->toggle_full_names_from_paths(ref_paths);
16331701
}
16341702

16351703
FlowCaller::~FlowCaller() {
16361704

16371705
}
16381706

1639-
bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
1707+
bool FlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector<int>* out_child_ploidies) {
16401708

16411709
// todo: In order to experiment with merging consecutive snarls to make longer traversals,
16421710
// I am experimenting with sending "fake" snarls through this code. So make a local
16431711
// copy to work on to do things like flip -- calling any snarl_manager code that
16441712
// wants a pointer will crash.
16451713
Snarl snarl = managed_snarl;
16461714

1715+
if (ploidy_override == 0) {
1716+
// returning true is a bit ironic, but we do *not* want to recurse so it's important we do
1717+
return true;
1718+
}
1719+
16471720
if (snarl.start().node_id() == snarl.end().node_id() ||
16481721
!graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) {
16491722
// can't call one-node or out-of graph snarls.
@@ -1803,10 +1876,14 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
18031876
// use our support caller to choose our genotype
18041877
vector<int> trav_genotype;
18051878
unique_ptr<SnarlCaller::CallInfo> trav_call_info;
1806-
int ploidy = ref_ploidies[ref_path_name];
1879+
int ploidy = ploidy_override == -1 ? ref_ploidies[ref_path_name] : ploidy_override;
18071880
std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name,
18081881
make_pair(get<0>(ref_interval), get<1>(ref_interval)));
1809-
1882+
if (out_child_ploidies != nullptr) {
1883+
// derive ploidies for child snarls from the genotype traversals and save them
1884+
resolve_child_ploidies(graph, &managed_snarl, travs, trav_genotype, *out_child_ploidies);
1885+
}
1886+
18101887
assert(trav_genotype.empty() || trav_genotype.size() == ploidy);
18111888

18121889
if (!gaf_output) {
@@ -1866,14 +1943,15 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph,
18661943
ref_path_set.insert(ref_paths[i]);
18671944
ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2;
18681945
}
1869-
1946+
1947+
this->toggle_full_names_from_paths(ref_paths);
18701948
}
18711949

18721950
NestedFlowCaller::~NestedFlowCaller() {
18731951

18741952
}
18751953

1876-
bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl) {
1954+
bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector<int>* out_child_ploidies) {
18771955

18781956
// remember the calls for each child snarl in this table
18791957
CallTable call_table;

src/graph_caller.hpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,16 @@ class GraphCaller {
4949
RecurseType recurise_type = RecurseOnFail);
5050

5151
/// Call a given snarl, and print the output to out_stream
52-
virtual bool call_snarl(const Snarl& snarl) = 0;
52+
virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector<int>* out_child_ploidies = nullptr) = 0;
5353

5454
protected:
5555

5656
/// Break up a chain into bits that we want to call using size heuristics
5757
vector<Chain> break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial);
58+
59+
/// Compute the child ploidies baseds on the called genotype
60+
void resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector<SnarlTraversal>& travs,
61+
const vector<int>& genotype, vector<int>& child_ploidies);
5862

5963
protected:
6064

@@ -135,6 +139,11 @@ class VCFOutputCaller {
135139

136140
// update the PS and LV tags in the output buffer (called in write_variants if include_nested is true)
137141
void update_nesting_info_tags(const SnarlManager* snarl_manager);
142+
143+
// toggle write_full_names automatically depending on ref_paths
144+
// (keeps backwards compatibility when it was always false whenever possible,
145+
// but flips to full names when necessary)
146+
void toggle_full_names_from_paths(const vector<string>& ref_paths);
138147

139148
/// output vcf
140149
mutable vcflib::VariantCallFile output_vcf;
@@ -154,6 +163,9 @@ class VCFOutputCaller {
154163

155164
// need to write LV/PS info tags
156165
bool include_nested;
166+
167+
// toggle writing full name or just contig name
168+
bool write_full_names;
157169
};
158170

159171
/**
@@ -221,7 +233,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu
221233

222234
virtual ~VCFGenotyper();
223235

224-
virtual bool call_snarl(const Snarl& snarl);
236+
virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector<int>* out_child_ploidies = nullptr);
225237

226238
virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
227239
const vector<size_t>& contig_length_overrides = {}) const;
@@ -274,7 +286,7 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller {
274286

275287
virtual ~LegacyCaller();
276288

277-
virtual bool call_snarl(const Snarl& snarl);
289+
virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector<int>* out_child_ploidies = nullptr);
278290

279291
virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
280292
const vector<size_t>& contig_length_overrides = {}) const;
@@ -375,7 +387,7 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC
375387

376388
virtual ~FlowCaller();
377389

378-
virtual bool call_snarl(const Snarl& snarl);
390+
virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector<int>* out_child_ploidies = nullptr);
379391

380392
virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
381393
const vector<size_t>& contig_length_overrides = {}) const;
@@ -449,7 +461,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO
449461

450462
virtual ~NestedFlowCaller();
451463

452-
virtual bool call_snarl(const Snarl& snarl);
464+
virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector<int>* out_child_ploidies = nullptr);
453465

454466
virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
455467
const vector<size_t>& contig_length_overrides = {}) const;

0 commit comments

Comments
 (0)