11#include " graph_caller.hpp"
22#include " algorithms/expand_context.hpp"
33#include " annotation.hpp"
4+ #include " rgfa.hpp"
45
56// #define debug
67
@@ -18,42 +19,52 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re
1819
1920 // Used to recurse on children of parents that can't be called
2021 size_t thread_count = get_thread_count ();
21- vector<vector<const Snarl*>> snarl_queue (thread_count);
22+ vector<vector<pair< const Snarl*, int > >> snarl_queue (thread_count);
2223
2324 // Run the snarl caller on a snarl, and queue up the children if it fails
24- auto process_snarl = [&](const Snarl* snarl) {
25+ auto process_snarl = [&](const Snarl* snarl, int ploidy_override ) {
2526
2627 if (!snarl_manager.is_trivial (snarl, graph)) {
2728
2829#ifdef debug
2930 cerr << " GraphCaller running call_snarl on " << pb2json (*snarl) << endl;
3031#endif
31-
32- bool was_called = call_snarl (*snarl);
33- if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) {
34- const vector<const Snarl*>& children = snarl_manager.children_of (snarl);
35- vector<const Snarl*>& thread_queue = snarl_queue[omp_get_thread_num ()];
36- thread_queue.insert (thread_queue.end (), children.begin (), children.end ());
32+ const vector<const Snarl*>& children = snarl_manager.children_of (snarl);
33+ vector<int > child_ploidies (children.size (), -1 );
34+ bool was_called = call_snarl (*snarl, ploidy_override, &child_ploidies);
35+ if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) {
36+ vector<pair<const Snarl*, int >>& thread_queue = snarl_queue[omp_get_thread_num ()];
37+ for (int64_t i = 0 ; i < children.size (); ++i) {
38+ thread_queue.push_back (make_pair (children[i], child_ploidies[i]));
39+ }
3740 }
3841 }
3942 };
4043
4144 // Start with the top level snarls
42- snarl_manager.for_each_top_level_snarl_parallel (process_snarl);
45+ // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl()
46+ vector<const Snarl*> top_level_snarls;
47+ snarl_manager.for_each_top_level_snarl ([&](const Snarl* snarl) {
48+ top_level_snarls.push_back (snarl);
49+ });
50+ #pragma omp parallel for schedule(dynamic, 1)
51+ for (int64_t i = 0 ; i < top_level_snarls.size (); ++i) {
52+ process_snarl (top_level_snarls[i], -1 );
53+ }
4354
4455 // Then recurse on any children the snarl caller failed to handle
4556 while (!std::all_of (snarl_queue.begin (), snarl_queue.end (),
46- [](const vector<const Snarl*>& snarl_vec) {return snarl_vec.empty ();})) {
47- vector<const Snarl*> cur_queue;
48- for (vector<const Snarl*>& thread_queue : snarl_queue) {
57+ [](const vector<pair< const Snarl*, int > >& snarl_vec) {return snarl_vec.empty ();})) {
58+ vector<pair< const Snarl*, int > > cur_queue;
59+ for (vector<pair< const Snarl*, int > >& thread_queue : snarl_queue) {
4960 cur_queue.reserve (cur_queue.size () + thread_queue.size ());
5061 std::move (thread_queue.begin (), thread_queue.end (), std::back_inserter (cur_queue));
5162 thread_queue.clear ();
5263 }
5364
5465#pragma omp parallel for schedule(dynamic, 1)
5566 for (int i = 0 ; i < cur_queue.size (); ++i) {
56- process_snarl (cur_queue[i]);
67+ process_snarl (cur_queue[i]. first , cur_queue[i]. second );
5768 }
5869 }
5970
@@ -175,8 +186,48 @@ vector<Chain> GraphCaller::break_chain(const HandleGraph& graph, const Chain& ch
175186
176187 return chain_frags;
177188}
189+
190+
191+ void GraphCaller::resolve_child_ploidies (const HandleGraph& graph, const Snarl* snarl, const vector<SnarlTraversal>& travs,
192+ const vector<int >& genotype, vector<int >& child_ploidies) {
193+ const vector<const Snarl*>& children = snarl_manager.children_of (snarl);
194+
195+ child_ploidies.resize (children.size ());
178196
179- VCFOutputCaller::VCFOutputCaller (const string& sample_name) : sample_name(sample_name), translation(nullptr ), include_nested(false )
197+ if (!children.empty ()) {
198+ // index the nodes in the traversals
199+ vector<unordered_set<handle_t >> trav_indexes (genotype.size ());
200+ for (int64_t i = 0 ; i < genotype.size (); ++i) {
201+ const SnarlTraversal& trav = travs[genotype[i]];
202+ unordered_set<handle_t >& trav_idx = trav_indexes[i];
203+ if (i > 0 && genotype[i] == genotype[i-1 ]) {
204+ trav_idx = trav_indexes[i-1 ];
205+ } else {
206+ for (int j = 0 ; j < trav.visit_size (); ++j) {
207+ trav_idx.insert (graph.get_handle (trav.visit (j).node_id (), trav.visit (j).backward ()));
208+ }
209+ }
210+ }
211+
212+ // for every child snarl, count the number of traversals that spans it -- this will be its ploidy
213+ for (int64_t i = 0 ; i < children.size (); ++i) {
214+ const Snarl* child_snarl = children[i];
215+ child_ploidies[i] = 0 ;
216+ handle_t child_start = graph.get_handle (child_snarl->start ().node_id (), child_snarl->start ().backward ());
217+ handle_t child_end = graph.get_handle (child_snarl->end ().node_id (), child_snarl->end ().backward ());
218+ for (int64_t j = 0 ; j < trav_indexes.size (); ++j) {
219+ unordered_set<handle_t >& trav_idx = trav_indexes[j];
220+ if ((trav_idx.count (child_start) && trav_idx.count (child_end)) ||
221+ (trav_idx.count (graph.flip (child_start)) && trav_idx.count (graph.flip (child_end)))) {
222+ ++child_ploidies[i];
223+ }
224+ }
225+ }
226+ }
227+ }
228+
229+
230+ VCFOutputCaller::VCFOutputCaller (const string& sample_name) : sample_name(sample_name), translation(nullptr ), include_nested(false ), write_full_names(false )
180231{
181232 output_variants.resize (get_thread_count ());
182233}
@@ -491,11 +542,11 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
491542
492543 // resolve subpath naming
493544 subrange_t subrange;
494- string basepath_name = Paths::strip_subrange (ref_path_name, &subrange);
545+ string basepath_name = Paths::strip_subrange (RGFACover::revert_rgfa_path_name ( ref_path_name) , &subrange);
495546 size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first ;
496547 // in VCF we usually just want a contig
497548 string contig_name = PathMetadata::parse_locus_name (basepath_name);
498- if (contig_name != PathMetadata::NO_LOCUS_NAME) {
549+ if (contig_name != PathMetadata::NO_LOCUS_NAME && ! this -> write_full_names ) {
499550 basepath_name = contig_name;
500551 }
501552 // fill out the rest of the variant
@@ -766,6 +817,20 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, function<void(cons
766817 }
767818}
768819
820+ void VCFOutputCaller::toggle_full_names_from_paths (const vector<string>& ref_paths) {
821+ set<string> samples;
822+ set<int > haplotypes;
823+ for (const string& path_name : ref_paths) {
824+ samples.insert (PathMetadata::parse_sample_name (path_name));
825+ haplotypes.insert (PathMetadata::parse_haplotype (path_name));
826+ if (samples.size () > 1 || haplotypes.size () > 1 ) {
827+ this ->write_full_names = true ;
828+ return ;
829+ }
830+ }
831+ this ->write_full_names = false ;
832+ }
833+
769834GAFOutputCaller::GAFOutputCaller (AlignmentEmitter* emitter, const string& sample_name, const vector<string>& ref_paths,
770835 size_t trav_padding) :
771836 emitter (emitter),
@@ -1054,7 +1119,7 @@ VCFGenotyper::~VCFGenotyper() {
10541119
10551120}
10561121
1057- bool VCFGenotyper::call_snarl (const Snarl& snarl) {
1122+ bool VCFGenotyper::call_snarl (const Snarl& snarl, int ploidy_override, vector< int >* out_child_ploidies ) {
10581123
10591124 // could be that our graph is a subgraph of the graph the snarls were computed from
10601125 // so bypass snarls we can't process
@@ -1296,6 +1361,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph,
12961361 // our graph is not in vg format. we will make graphs for each site as needed and work with those
12971362 traversal_finder = nullptr ;
12981363 }
1364+
1365+ this ->toggle_full_names_from_paths (ref_paths);
12991366}
13001367
13011368LegacyCaller::~LegacyCaller () {
@@ -1305,7 +1372,7 @@ LegacyCaller::~LegacyCaller() {
13051372 }
13061373}
13071374
1308- bool LegacyCaller::call_snarl (const Snarl& snarl) {
1375+ bool LegacyCaller::call_snarl (const Snarl& snarl, int ploidy_override, vector< int >* out_child_ploidies ) {
13091376
13101377 // if we can't handle the snarl, then the GraphCaller framework will recurse on its children
13111378 if (!is_traversable (snarl)) {
@@ -1630,20 +1697,26 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph,
16301697 ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size () ? ref_path_ploidies[i] : 2 ;
16311698 }
16321699
1700+ this ->toggle_full_names_from_paths (ref_paths);
16331701}
16341702
16351703FlowCaller::~FlowCaller () {
16361704
16371705}
16381706
1639- bool FlowCaller::call_snarl (const Snarl& managed_snarl) {
1707+ bool FlowCaller::call_snarl (const Snarl& managed_snarl, int ploidy_override, vector< int >* out_child_ploidies ) {
16401708
16411709 // todo: In order to experiment with merging consecutive snarls to make longer traversals,
16421710 // I am experimenting with sending "fake" snarls through this code. So make a local
16431711 // copy to work on to do things like flip -- calling any snarl_manager code that
16441712 // wants a pointer will crash.
16451713 Snarl snarl = managed_snarl;
16461714
1715+ if (ploidy_override == 0 ) {
1716+ // returning true is a bit ironic, but we do *not* want to recurse so it's important we do
1717+ return true ;
1718+ }
1719+
16471720 if (snarl.start ().node_id () == snarl.end ().node_id () ||
16481721 !graph.has_node (snarl.start ().node_id ()) || !graph.has_node (snarl.end ().node_id ())) {
16491722 // can't call one-node or out-of graph snarls.
@@ -1803,10 +1876,14 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
18031876 // use our support caller to choose our genotype
18041877 vector<int > trav_genotype;
18051878 unique_ptr<SnarlCaller::CallInfo> trav_call_info;
1806- int ploidy = ref_ploidies[ref_path_name];
1879+ int ploidy = ploidy_override == - 1 ? ref_ploidies[ref_path_name] : ploidy_override ;
18071880 std::tie (trav_genotype, trav_call_info) = snarl_caller.genotype (snarl, travs, ref_trav_idx, ploidy, ref_path_name,
18081881 make_pair (get<0 >(ref_interval), get<1 >(ref_interval)));
1809-
1882+ if (out_child_ploidies != nullptr ) {
1883+ // derive ploidies for child snarls from the genotype traversals and save them
1884+ resolve_child_ploidies (graph, &managed_snarl, travs, trav_genotype, *out_child_ploidies);
1885+ }
1886+
18101887 assert (trav_genotype.empty () || trav_genotype.size () == ploidy);
18111888
18121889 if (!gaf_output) {
@@ -1866,14 +1943,15 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph,
18661943 ref_path_set.insert (ref_paths[i]);
18671944 ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size () ? ref_path_ploidies[i] : 2 ;
18681945 }
1869-
1946+
1947+ this ->toggle_full_names_from_paths (ref_paths);
18701948}
18711949
18721950NestedFlowCaller::~NestedFlowCaller () {
18731951
18741952}
18751953
1876- bool NestedFlowCaller::call_snarl (const Snarl& managed_snarl) {
1954+ bool NestedFlowCaller::call_snarl (const Snarl& managed_snarl, int ploidy_override, vector< int >* out_child_ploidies ) {
18771955
18781956 // remember the calls for each child snarl in this table
18791957 CallTable call_table;
0 commit comments