Skip to content

Commit 21b8dec

Browse files
committed
add tag for allele frequency to call output
1 parent 4d9fe96 commit 21b8dec

File tree

4 files changed

+55
-7
lines changed

4 files changed

+55
-7
lines changed

src/graph_caller.cpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,8 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
448448
const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
449449
const vector<int>& genotype, int ref_trav_idx, const unique_ptr<SnarlCaller::CallInfo>& call_info,
450450
const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy,
451-
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string) {
451+
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string,
452+
const vector<pair<string, string>>& info_tags) {
452453

453454
#ifdef debug
454455
cerr << "emitting variant for " << pb2json(snarl) << endl;
@@ -581,6 +582,11 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa
581582
// add some support info
582583
snarl_caller.update_vcf_info(snarl, site_traversals, site_genotype, call_info, sample_name, out_variant);
583584

585+
// add the info
586+
for (const pair<string, string>& info_tag : info_tags) {
587+
out_variant.info[info_tag.first].push_back(info_tag.second);
588+
}
589+
584590
// if genotype_snarls, then we only flatten up to the snarl endpoints
585591
// (this is when we are in genotyping mode and want consistent calls regardless of the sample)
586592
int64_t flatten_len_s = 0;
@@ -1827,13 +1833,24 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
18271833

18281834
vector<SnarlTraversal> travs;
18291835
FlowTraversalFinder* flow_trav_finder = dynamic_cast<FlowTraversalFinder*>(&traversal_finder);
1836+
vector<pair<string, string>> extra_info_tags;
1837+
int64_t site_allele_count = -1;
18301838
if (flow_trav_finder != nullptr) {
18311839
// find the max flow traversals using specialized interface that accepts avg heurstic toggle
18321840
pair<vector<SnarlTraversal>, vector<double>> weighted_travs = flow_trav_finder->find_weighted_traversals(snarl, greedy_avg_flow);
18331841
travs = std::move(weighted_travs.first);
18341842
} else {
1835-
// find the traversals using the generic interface
1836-
travs = traversal_finder.find_traversals(snarl);
1843+
// find the traversals with the gbwt. also count the haplotypes in the site and remember it
1844+
// for a VCF tag (GAN)
1845+
GBWTTraversalFinder* gbwt_trav_finder = dynamic_cast<GBWTTraversalFinder*>(&traversal_finder);
1846+
if (gbwt_trav_finder != nullptr) {
1847+
pair<vector<SnarlTraversal>, vector<gbwt::size_type>> gbwt_path_travs = gbwt_trav_finder->find_path_traversals(snarl);
1848+
travs = std::move(gbwt_path_travs.first);
1849+
extra_info_tags.push_back(make_pair("GAN", std::to_string(gbwt_trav_finder->count_haplotypes(gbwt_path_travs.second))));
1850+
} else {
1851+
// find the traversals using the generic interface
1852+
travs = traversal_finder.find_traversals(snarl);
1853+
}
18371854
}
18381855

18391856
if (travs.empty()) {
@@ -1893,7 +1910,7 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
18931910
bool added = true;
18941911
if (!gaf_output) {
18951912
added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name,
1896-
ref_offsets[ref_path_name], genotype_snarls, ploidy);
1913+
ref_offsets[ref_path_name], genotype_snarls, ploidy, nullptr, extra_info_tags);
18971914
} else {
18981915
pair<string, int64_t> pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]);
18991916
emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder);
@@ -1908,6 +1925,9 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
19081925
string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
19091926
const vector<size_t>& contig_length_overrides) const {
19101927
string header = VCFOutputCaller::vcf_header(graph, contigs, contig_length_overrides);
1928+
if (dynamic_cast<GBWTTraversalFinder*>(&this->traversal_finder) != nullptr) {
1929+
header+= "##INFO=<ID=GAN,Number=1,Type=Integer,Description=\"Number of haplotypes going through site in the graph\">\n";
1930+
}
19111931
header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
19121932
snarl_caller.update_vcf_header(header);
19131933
header += "##FILTER=<ID=PASS,Description=\"All filters passed\">\n";

src/graph_caller.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ class VCFOutputCaller {
119119
const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
120120
const vector<int>& genotype, int ref_trav_idx, const unique_ptr<SnarlCaller::CallInfo>& call_info,
121121
const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy,
122-
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string = nullptr);
122+
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string = nullptr,
123+
const vector<pair<string, string>>& info_tags = {});
123124

124125
/// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface
125126
/// the bool is true if the snarl's backward on the path

src/traversal_finder.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "cactus.hpp"
55
#include "gbwt_helper.hpp"
66
#include "haplotype_extracter.hpp"
7+
#include <gbwtgraph/gbwtgraph.h>
78
//#define debug
89

910
namespace vg {
@@ -3435,7 +3436,7 @@ pair<vector<SnarlTraversal>, vector<double>> FlowTraversalFinder::find_weighted_
34353436
GBWTTraversalFinder::GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt) :
34363437
graph(graph),
34373438
gbwt(gbwt) {
3438-
3439+
this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(gbwt);
34393440
}
34403441

34413442
GBWTTraversalFinder::~GBWTTraversalFinder() {
@@ -3599,7 +3600,26 @@ pair<vector<Traversal>, vector<gbwt::size_type>> GBWTTraversalFinder::find_path_
35993600
return path_traversals;
36003601
}
36013602

3603+
int64_t GBWTTraversalFinder::count_haplotypes(const vector<gbwt::size_type>& gbwt_paths) const {
3604+
unordered_set<pair<string, int64_t>> haplotypes;
3605+
for (const gbwt::size_type& gbwt_path : gbwt_paths) {
3606+
gbwt::size_type path_id = gbwt::Path::id(gbwt_path);
3607+
// metadata check copied from vg deconsturct
3608+
if (!gbwt.hasMetadata() || !gbwt.metadata.hasPathNames() || path_id >= gbwt.metadata.paths()) {
3609+
continue;
3610+
}
3611+
PathSense sense = gbwtgraph::get_path_sense(gbwt, path_id, gbwt_reference_samples);
3612+
string sample_name = gbwtgraph::get_path_sample_name(gbwt, path_id, sense);
3613+
if (sample_name != PathMetadata::NO_SAMPLE_NAME) {
3614+
haplotypes.insert(make_pair(sample_name,
3615+
gbwtgraph::get_path_haplotype(gbwt, path_id, sense)));
3616+
} else {
3617+
haplotypes.insert(make_pair(gbwtgraph::compose_path_name(gbwt, path_id, sense), 0));
3618+
}
3619+
}
3620+
return haplotypes.size();
36023621

3622+
}
36033623

36043624
}
36053625

src/traversal_finder.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -638,7 +638,9 @@ class GBWTTraversalFinder : public TraversalFinder {
638638

639639
const HandleGraph& graph;
640640
const gbwt::GBWT& gbwt;
641-
641+
// When using the gbwt we need some precomputed information to ask about stored paths.
642+
unordered_set<string> gbwt_reference_samples;
643+
642644
public:
643645

644646
GBWTTraversalFinder(const HandleGraph& graph, const gbwt::GBWT& gbwt);
@@ -666,6 +668,11 @@ class GBWTTraversalFinder : public TraversalFinder {
666668
virtual pair<vector<Traversal>, vector<gbwt::size_type>> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end);
667669

668670
const gbwt::GBWT& get_gbwt() { return gbwt; }
671+
672+
/**
673+
* Count the unique sample/haplotype pairs that are found in the nsarl (from second output of find_path_traversals)
674+
*/
675+
int64_t count_haplotypes(const vector<gbwt::size_type>& gbwt_paths) const;
669676

670677
protected:
671678

0 commit comments

Comments
 (0)