Skip to content

Commit cbf330b

Browse files
authored
Merge pull request #4698 from vgteam/vg-chains-subcommand
vg chains subcommand
2 parents e646f63 + e876470 commit cbf330b

File tree

2 files changed

+428
-0
lines changed

2 files changed

+428
-0
lines changed

src/subcommand/chains_main.cpp

Lines changed: 364 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,364 @@
1+
/** \file chains_main.cpp
2+
*
3+
* Defines the "vg chains" subcommand, which extracts the handles in top-level chains from a distance index or a snarl file.
4+
*
5+
* NOTE: If a component (~chromosome) does not start with a shared node, the top-level chain is not unique.
6+
* For example, if the component starts with edges (u, w) and (v, w), the chain may start with either u or v.
7+
*
8+
* TODO: Option to get chain (~contig, chromosome) names from a graph.
9+
*
10+
* TODO: Full GFA format with segments and jumps.
11+
*/
12+
13+
#include "subcommand.hpp"
14+
15+
#include "../gbwt_helper.hpp"
16+
#include "../snarl_distance_index.hpp"
17+
#include "../snarls.hpp"
18+
19+
#include <sdsl/int_vector.hpp>
20+
#include <vg/io/vpkg.hpp>
21+
22+
#include <fstream>
23+
#include <iostream>
24+
#include <tuple>
25+
#include <vector>
26+
27+
#include <getopt.h>
28+
29+
using namespace vg;
30+
31+
//----------------------------------------------------------------------------
32+
33+
struct ChainsConfig {
34+
std::string graph_file;
35+
std::string input_file, output_file;
36+
enum Format {
37+
BINARY,
38+
GFA
39+
} output_format = BINARY;
40+
bool progress = false;
41+
42+
ChainsConfig(int argc, char** argv);
43+
};
44+
45+
void write_binary(const std::vector<sdsl::int_vector<>>& chains, std::ostream& out);
46+
47+
void write_gfa_paths(const std::vector<sdsl::int_vector<>>& chains, std::ostream& out);
48+
49+
gbwt::vector_type extract_chain(const SnarlDistanceIndex& distance_index, const HandleGraph& graph, net_handle_t chain, size_t chain_id);
50+
51+
gbwt::vector_type extract_chain(const SnarlManager& snarls, const HandleGraph& graph, const Chain& chain, size_t chain_id);
52+
53+
sdsl::int_vector<> normalize_chain(gbwt::vector_type& chain);
54+
55+
//----------------------------------------------------------------------------
56+
57+
int main_chains(int argc, char** argv) {
58+
ChainsConfig config(argc, argv);
59+
60+
if (config.progress) {
61+
std::cerr << "Loading graph from " << config.graph_file << std::endl;
62+
}
63+
std::unique_ptr<HandleGraph> graph = io::VPKG::load_one<HandleGraph>(config.graph_file);
64+
65+
if (config.progress) {
66+
std::cerr << "Loading distance index or snarl file from " << config.input_file << std::endl;
67+
}
68+
std::unique_ptr<SnarlDistanceIndex> distance_index;
69+
std::unique_ptr<SnarlManager> snarls;
70+
std::tie(distance_index, snarls) = io::VPKG::try_load_first<SnarlDistanceIndex, SnarlManager>(config.input_file);
71+
if (distance_index != nullptr) {
72+
if (config.progress) {
73+
std::cerr << "Found a distance index" << std::endl;
74+
}
75+
} else if (snarls != nullptr) {
76+
if (config.progress) {
77+
std::cerr << "Found a snarl file" << std::endl;
78+
}
79+
} else {
80+
std::cerr << "error: [vg chains] unable to load distance index or snarl file from " << config.input_file << std::endl;
81+
return 1;
82+
}
83+
84+
if (config.progress) {
85+
std::cerr << "Extracting chains" << std::endl;
86+
}
87+
std::vector<sdsl::int_vector<>> chains;
88+
if (distance_index != nullptr) {
89+
size_t chain_id = 0;
90+
distance_index->for_each_child(distance_index->get_root(), [&](const handlegraph::net_handle_t& chain) {
91+
gbwt::vector_type buffer = extract_chain(*distance_index, *graph, chain, chain_id);
92+
if (!buffer.empty()) {
93+
chains.push_back(normalize_chain(buffer));
94+
}
95+
chain_id++;
96+
});
97+
} else {
98+
size_t chain_id = 0;
99+
snarls->for_each_top_level_chain([&](const Chain* chain) {
100+
gbwt::vector_type buffer = extract_chain(*snarls, *graph, *chain, chain_id);
101+
if (!buffer.empty()) {
102+
chains.push_back(normalize_chain(buffer));
103+
}
104+
chain_id++;
105+
});
106+
}
107+
if (config.progress) {
108+
size_t total_handles = 0;
109+
for (const auto& chain : chains) {
110+
total_handles += chain.size();
111+
}
112+
std::cerr << "Extracted " << chains.size() << " chains with " << total_handles << " handles" << std::endl;
113+
}
114+
std::sort(chains.begin(), chains.end());
115+
116+
std::ostream* out_stream;
117+
std::ofstream out_file;
118+
if (config.output_file.empty()) {
119+
if (config.progress) {
120+
std::cerr << "Writing output to stdout" << std::endl;
121+
}
122+
out_stream = &std::cout;
123+
} else {
124+
if (config.progress) {
125+
std::cerr << "Writing output to " << config.output_file << std::endl;
126+
}
127+
out_file.open(config.output_file, std::ios_base::binary);
128+
out_stream = &out_file;
129+
}
130+
if (config.output_format == ChainsConfig::BINARY) {
131+
if (config.progress) {
132+
std::cerr << "Writing binary format" << std::endl;
133+
}
134+
write_binary(chains, *out_stream);
135+
} else if (config.output_format == ChainsConfig::GFA) {
136+
if (config.progress) {
137+
std::cerr << "Writing GFA paths" << std::endl;
138+
}
139+
write_gfa_paths(chains, *out_stream);
140+
} else {
141+
std::cerr << "error: [vg chains] unknown output format" << std::endl;
142+
return 1;
143+
}
144+
out_file.close();
145+
146+
return 0;
147+
}
148+
149+
static vg::subcommand::Subcommand vg_chains("chains", "extract handles in top-level chains", vg::subcommand::WIDGET, main_chains);
150+
151+
//----------------------------------------------------------------------------
152+
153+
void help_chains(char** argv) {
154+
std::cerr << "usage: " << argv[0] << " " << argv[1] << " [options] graph input > output" << std::endl;
155+
std::cerr << std::endl;
156+
157+
std::cerr << "Extracts handles in top-level chains from a distance index or a snarl file." << std::endl;
158+
std::cerr << std::endl;
159+
160+
std::cerr << "Output options:" << std::endl;
161+
std::cerr << " -o, --output FILE write the output to FILE" << std::endl;
162+
std::cerr << " -b, --binary output binary format (default)" << std::endl;
163+
std::cerr << " -g, --gfa output GFA paths using jumps" << std::endl;
164+
std::cerr << std::endl;
165+
166+
std::cerr << "Other options:" << std::endl;
167+
std::cerr << " -h, --help print this help message to stderr and exit" << std::endl;
168+
std::cerr << " -p, --progress report progress to stderr" << std::endl;
169+
std::cerr << std::endl;
170+
}
171+
172+
ChainsConfig::ChainsConfig(int argc, char** argv) {
173+
static struct option long_options[] = {
174+
{ "output", required_argument, nullptr, 'o' },
175+
{ "binary", no_argument, nullptr, 'b' },
176+
{ "gfa", no_argument, nullptr, 'g' },
177+
{ "help", no_argument, nullptr, 'h' },
178+
{ "progress", no_argument, nullptr, 'p' },
179+
{ 0, 0, 0, 0 }
180+
};
181+
182+
optind = 2; // force optind past command positional argument
183+
while (true) {
184+
int option_index = 0;
185+
int c = getopt_long(argc, argv, "o:bgh?p", long_options, &option_index);
186+
if (c == -1) { break; } // End of options.
187+
188+
switch (c)
189+
{
190+
case 'o':
191+
this->output_file = optarg;
192+
break;
193+
case 'b':
194+
this->output_format = BINARY;
195+
break;
196+
case 'g':
197+
this->output_format = GFA;
198+
break;
199+
200+
case 'h':
201+
case '?':
202+
help_chains(argv);
203+
std::exit(EXIT_FAILURE);
204+
case 'p':
205+
this->progress = true;
206+
break;
207+
208+
default:
209+
std::abort();
210+
}
211+
}
212+
213+
if (optind + 2 != argc) {
214+
help_chains(argv);
215+
std::exit(EXIT_FAILURE);
216+
}
217+
this->graph_file = argv[optind]; optind++;
218+
this->input_file = argv[optind]; optind++;
219+
}
220+
221+
//----------------------------------------------------------------------------
222+
223+
void write_binary(const std::vector<sdsl::int_vector<>>& chains, std::ostream& out) {
224+
std::uint64_t num_chains = chains.size();
225+
sdsl::simple_sds::serialize_value(num_chains, out);
226+
for (const auto& chain : chains) {
227+
chain.simple_sds_serialize(out);
228+
}
229+
}
230+
231+
void write_gfa_paths(const std::vector<sdsl::int_vector<>>& chains, std::ostream& out) {
232+
for (size_t i = 0; i < chains.size(); i++) {
233+
const auto& chain = chains[i];
234+
out << "P\t" << i << "\t";
235+
for (size_t j = 0; j < chain.size(); j++) {
236+
nid_t id = gbwt::Node::id(chain[j]);
237+
bool is_reverse = gbwt::Node::is_reverse(chain[j]);
238+
if (j > 0) {
239+
// These are typically jumps, not edges.
240+
out << ";";
241+
}
242+
out << id << (is_reverse ? "-" : "+");
243+
}
244+
out << "\t*" << std::endl;
245+
}
246+
}
247+
248+
net_handle_t follow_chain(const SnarlDistanceIndex& distance_index, const HandleGraph& graph, size_t chain_id, net_handle_t curr) {
249+
net_handle_t next = curr;
250+
size_t successors = 0;
251+
distance_index.follow_net_edges(next, &graph, false, [&](const net_handle_t& child) {
252+
successors++;
253+
next = child;
254+
});
255+
if (successors != 1) {
256+
std::cerr << "follow_chain(): chain " << chain_id << " has " << successors << " successors for a child";
257+
std::exit(EXIT_FAILURE);
258+
}
259+
return next;
260+
}
261+
262+
void try_append(gbwt::vector_type& chain, gbwt::node_type start, gbwt::node_type end) {
263+
if (start == gbwt::ENDMARKER || end == gbwt::ENDMARKER) {
264+
return;
265+
}
266+
if (chain.empty() || chain.back() != start) {
267+
chain.push_back(start);
268+
}
269+
if (chain.empty() || chain.back() != end) {
270+
chain.push_back(end);
271+
}
272+
}
273+
274+
gbwt::vector_type extract_chain(const SnarlDistanceIndex& distance_index, const HandleGraph& graph, net_handle_t chain, size_t chain_id) {
275+
gbwt::vector_type result;
276+
277+
// Closed interval of net handles.
278+
net_handle_t curr = distance_index.get_bound(chain, false, true);
279+
net_handle_t chain_end = distance_index.get_bound(chain, true, false);
280+
gbwt::node_type prev = gbwt::ENDMARKER; // Possible start of a snarl.
281+
bool was_snarl = false;
282+
while (true) {
283+
if (distance_index.is_node(curr)) {
284+
handle_t handle = distance_index.get_handle(curr, &graph);
285+
gbwt::node_type node = handle_to_gbwt(graph, handle);
286+
if (was_snarl) {
287+
try_append(result, prev, node);
288+
}
289+
prev = node;
290+
was_snarl = false;
291+
} else if (distance_index.is_snarl(curr)) {
292+
// Trivial snarls do not show up in the traversal.
293+
was_snarl = true;
294+
} else {
295+
was_snarl = false;
296+
}
297+
if (curr == chain_end) {
298+
break;
299+
}
300+
curr = follow_chain(distance_index, graph, chain_id, curr);
301+
}
302+
303+
for (auto handle : result) {
304+
nid_t id = gbwt::Node::id(handle);
305+
if (!graph.has_node(id)) {
306+
std::cerr << "extract_chain(): chain " << chain_id << " has a handle for missing node " << id << std::endl;
307+
std::exit(EXIT_FAILURE);
308+
}
309+
}
310+
311+
return result;
312+
}
313+
314+
gbwt::vector_type extract_chain(const SnarlManager& snarls, const HandleGraph& graph, const Chain& chain, size_t chain_id) {
315+
gbwt::vector_type result;
316+
317+
for (auto iter = chain_begin(chain); iter != chain_end(chain); ++iter) {
318+
const Snarl* snarl = iter->first;
319+
if (snarls.is_trivial(snarl, graph)) {
320+
// Skip trivial snarls.
321+
continue;
322+
}
323+
gbwt::node_type start = gbwt::Node::encode(snarl->start().node_id(), snarl->start().backward());
324+
gbwt::node_type end = gbwt::Node::encode(snarl->end().node_id(), snarl->end().backward());
325+
if (iter->second) {
326+
// Reverse snarl; we must swap and flip the endpoints.
327+
std::swap(start, end);
328+
start = gbwt::Node::reverse(start);
329+
end = gbwt::Node::reverse(end);
330+
}
331+
try_append(result, start, end);
332+
}
333+
334+
return result;
335+
}
336+
337+
sdsl::int_vector<> normalize_chain(gbwt::vector_type& chain) {
338+
// Normalize the orientation.
339+
// TODO: What if there is the same number of forward and reverse nodes?
340+
size_t reverse_nodes = 0;
341+
for (const auto& node : chain) {
342+
if (gbwt::Node::is_reverse(node)) {
343+
reverse_nodes++;
344+
}
345+
}
346+
if (reverse_nodes > chain.size() / 2) {
347+
gbwt::reversePath(chain);
348+
}
349+
350+
// Bit compress the vector.
351+
size_t width = 1;
352+
auto iter = std::max_element(chain.begin(), chain.end());
353+
if (iter != chain.end()) {
354+
width = sdsl::bits::length(*iter);
355+
}
356+
sdsl::int_vector<> result(chain.size(), 0, width);
357+
for (size_t i = 0; i < chain.size(); i++) {
358+
result[i] = chain[i];
359+
}
360+
361+
return result;
362+
}
363+
364+
//----------------------------------------------------------------------------

0 commit comments

Comments
 (0)