Skip to content

Commit a3b57fa

Browse files
committed
add --snarl-outlier option
1 parent 62b3be2 commit a3b57fa

File tree

1 file changed

+83
-19
lines changed

1 file changed

+83
-19
lines changed

src/subcommand/stats_main.cpp

Lines changed: 83 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "../io/save_handle_graph.hpp"
3535
#include "../gbzgraph.hpp"
3636
#include "../traversal_finder.hpp"
37+
#include "../traversal_clusters.hpp"
3738

3839
using namespace std;
3940
using namespace vg;
@@ -66,7 +67,8 @@ void help_stats(char** argv) {
6667
<< " -O, --overlap-all print overlap table for the cartesian product of paths" << endl
6768
<< " -R, --snarls print statistics for each snarl" << endl
6869
<< " --snarl-contents print out a table of <snarl, depth, parent, contained node ids>" << endl
69-
<< " --snarl-sample print out reference coordinates on given sample" << endl
70+
<< " --snarl-sample S print out reference coordinates on given sample S" << endl
71+
<< " --snarl-outlier print most diverged path from reference (--snarl-sample required)" << endl
7072
<< " -C, --chains print statistics for each chain" << endl
7173
<< " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " <<
7274
"Can't detect Protobuf if graph read from stdin" << endl
@@ -109,13 +111,15 @@ int main_stats(int argc, char** argv) {
109111
bool chain_stats = false;
110112
bool snarl_contents = false;
111113
string snarl_sample;
114+
bool snarl_outlier = false;
112115
bool format = false;
113116
bool degree_dist = false;
114117
string distance_index_filename;
115118

116119
// Long options with no corresponding short options.
117120
constexpr int OPT_SNARL_CONTENTS = 1000;
118121
constexpr int OPT_SNARL_SAMPLE = 1001;
122+
constexpr int OPT_SNARL_OUTLIER = 1002;
119123
constexpr int64_t snarl_search_context = 100;
120124

121125
int c;
@@ -146,6 +150,7 @@ int main_stats(int argc, char** argv) {
146150
{"snarls", no_argument, 0, 'R'},
147151
{"snarl-contents", no_argument, 0, OPT_SNARL_CONTENTS},
148152
{"snarl-sample", required_argument, 0, OPT_SNARL_SAMPLE},
153+
{"snarl-outlier", no_argument, 0, OPT_SNARL_OUTLIER},
149154
{"chains", no_argument, 0, 'C'},
150155
{"format", no_argument, 0, 'F'},
151156
{"degree-dist", no_argument, 0, 'D'},
@@ -251,6 +256,10 @@ int main_stats(int argc, char** argv) {
251256
case OPT_SNARL_SAMPLE:
252257
snarl_sample = optarg;
253258
break;
259+
260+
case OPT_SNARL_OUTLIER:
261+
snarl_outlier = true;
262+
break;
254263

255264
case 'C':
256265
chain_stats = true;
@@ -297,6 +306,11 @@ int main_stats(int argc, char** argv) {
297306
exit(1);
298307
}
299308

309+
if (snarl_outlier && snarl_sample.empty()) {
310+
cerr << "error [vg stats]: --snarl-outlier can only be used with --snarl-sample" << endl;
311+
exit(1);
312+
}
313+
300314
bdsg::ReferencePathOverlayHelper overlay_helper;
301315
unique_ptr<PathHandleGraph> path_handle_graph;
302316
PathHandleGraph* graph = nullptr;
@@ -1142,18 +1156,24 @@ int main_stats(int argc, char** argv) {
11421156
if (!snarl_sample.empty()) {
11431157
// optionally prefix with bed-like refpath coordinates if --snarl-sample given
11441158
cout <<"Contig\tStartPos\tEndPos\t";
1159+
if (snarl_outlier) {
1160+
cout << "OutlierName\tOutlierJaccard\tOutlierLen\t";
1161+
}
11451162

11461163
if (pp_graph == nullptr) {
11471164
pp_graph = overlay_helper.apply(graph);
11481165
}
11491166
vector<string> ref_path_names;
11501167
pp_graph->for_each_path_of_sample(snarl_sample, [&](path_handle_t path_handle) {
1151-
ref_path_names.push_back(graph->get_path_name(path_handle));
1168+
ref_path_names.push_back(pp_graph->get_path_name(path_handle));
11521169
});
11531170
if (ref_path_names.empty()) {
11541171
cerr << "error [vg stats]: unable to find any paths of --snarl-sample " << snarl_sample << endl;
11551172
exit(1);
11561173
}
1174+
if (snarl_outlier) {
1175+
ref_path_names.clear(); // if searching for outliers, we want all traversals
1176+
}
11571177
path_trav_finder = unique_ptr<PathTraversalFinder>(new PathTraversalFinder(*pp_graph, ref_path_names));
11581178
}
11591179
cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n";
@@ -1164,15 +1184,26 @@ int main_stats(int argc, char** argv) {
11641184
if (snarl_stats) {
11651185
if (!snarl_sample.empty()) {
11661186
// find the reference interval from path index if snarl lies directly on reference path
1167-
vector<PathInterval> travs;
1168-
std::tie(std::ignore, travs) = path_trav_finder->find_path_traversals(
1187+
vector<Traversal> travs;
1188+
vector<PathInterval> trav_intervals;
1189+
std::tie(travs, trav_intervals) = path_trav_finder->find_path_traversals(
11691190
pp_graph->get_handle(snarl->start().node_id(), snarl->start().backward()),
11701191
pp_graph->get_handle(snarl->end().node_id(), snarl->end().backward()));
1171-
if (travs.empty() && snarl_to_ref.count(manager.parent_of(snarl))) {
1192+
for (int64_t i = 0; i < trav_intervals.size(); ++i) {
1193+
// make sure ref trav is first
1194+
if (pp_graph->get_sample_name(pp_graph->get_path_handle_of_step(trav_intervals[i].first)) == snarl_sample) {
1195+
if (i > 1) {
1196+
swap(travs[0], travs[1]);
1197+
swap(trav_intervals[0], trav_intervals[i]);
1198+
}
1199+
break;
1200+
}
1201+
}
1202+
if (trav_intervals.empty() && snarl_to_ref.count(manager.parent_of(snarl))) {
11721203
// snarl's not on the reference path, us its parent interval
1173-
travs.push_back(snarl_to_ref.at(manager.parent_of(snarl)));
1204+
trav_intervals.push_back(snarl_to_ref.at(manager.parent_of(snarl)));
11741205
}
1175-
if (travs.empty()) {
1206+
if (trav_intervals.empty()) {
11761207
// search toward the reference path
11771208
unordered_map<path_handle_t, vector<step_handle_t>> start_steps;
11781209
unordered_map<path_handle_t, vector<step_handle_t>> end_steps;
@@ -1206,43 +1237,43 @@ int main_stats(int argc, char** argv) {
12061237
step != graph->path_end(graph->get_path_handle_of_step(start_path_steps.second[0]));
12071238
step = graph->get_next_step(step)) {
12081239
if (end_ids.count(graph->get_id(graph->get_handle_of_step(step)))) {
1209-
travs.push_back(make_pair(start_path_steps.second[0],
1240+
trav_intervals.push_back(make_pair(start_path_steps.second[0],
12101241
end_ids[graph->get_id(graph->get_handle_of_step(step))]));
12111242
break;
12121243
}
12131244
}
12141245
// search backward
1215-
if (travs.empty()) {
1246+
if (trav_intervals.empty()) {
12161247
for (step_handle_t step = graph->get_previous_step(start_path_steps.second[0]);
12171248
step != graph->path_front_end(graph->get_path_handle_of_step(start_path_steps.second[0]));
12181249
step = graph->get_previous_step(step)) {
12191250
if (end_ids.count(graph->get_id(graph->get_handle_of_step(step)))) {
1220-
travs.push_back(make_pair(start_path_steps.second[0],
1251+
trav_intervals.push_back(make_pair(start_path_steps.second[0],
12211252
end_ids[graph->get_id(graph->get_handle_of_step(step))]));
12221253
break;
12231254
}
12241255
}
12251256
}
12261257
}
1227-
if (!travs.empty()) {
1258+
if (!trav_intervals.empty()) {
12281259
break;
12291260
}
12301261
}
12311262
// unlikely, but maybe only one end of the snarl reaches the reference. in that case
12321263
// we just consider it both ends of the interval
1233-
if (travs.empty() && !start_steps.empty()) {
1234-
travs.push_back(make_pair(start_steps.begin()->second.front(), start_steps.begin()->second.front()));
1264+
if (trav_intervals.empty() && !start_steps.empty()) {
1265+
trav_intervals.push_back(make_pair(start_steps.begin()->second.front(), start_steps.begin()->second.front()));
12351266
}
1236-
if (travs.empty() && !end_steps.empty()) {
1237-
travs.push_back(make_pair(end_steps.begin()->second.front(), end_steps.begin()->second.front()));
1267+
if (trav_intervals.empty() && !end_steps.empty()) {
1268+
trav_intervals.push_back(make_pair(end_steps.begin()->second.front(), end_steps.begin()->second.front()));
12381269
}
12391270
}
1240-
if (travs.empty()) {
1271+
if (trav_intervals.empty()) {
12411272
cout << ".\t.\t.\t";
12421273
} else {
12431274
// note: in case of a cycle, we're just taking the first found
1244-
step_handle_t start_step = travs.front().first;
1245-
step_handle_t end_step = travs.front().second;
1275+
step_handle_t start_step = trav_intervals.front().first;
1276+
step_handle_t end_step = trav_intervals.front().second;
12461277
int64_t start_pos = pp_graph->get_position_of_step(start_step);
12471278
int64_t end_pos = pp_graph->get_position_of_step(end_step);
12481279
if (end_pos < start_pos) {
@@ -1254,7 +1285,40 @@ int main_stats(int argc, char** argv) {
12541285
<< end_pos << "\t";
12551286

12561287
// use this interval for off-reference children
1257-
snarl_to_ref[snarl] = travs.front();
1288+
snarl_to_ref[snarl] = trav_intervals.front();
1289+
1290+
// print the outlier of snarl (only works if snarl is exactly on reference since it
1291+
// requires the full traversal list)
1292+
if (snarl_outlier) {
1293+
int64_t outlier_idx = -1;
1294+
double outlier_jaccard = numeric_limits<double>::max();
1295+
int64_t outlier_len = -1;
1296+
if (trav_intervals.size() > 1) {
1297+
vector<double> jaccards(trav_intervals.size());
1298+
multiset<handle_t> refset;
1299+
for (handle_t handle : travs[0]) {
1300+
refset.insert(handle);
1301+
}
1302+
for (int64_t i = 1; i < trav_intervals.size(); ++i) {
1303+
multiset<handle_t> travset;
1304+
int64_t travlen = 0;
1305+
for (handle_t handle : travs[i]) {
1306+
travset.insert(handle);
1307+
travlen += pp_graph->get_length(handle);
1308+
}
1309+
double jaccard = weighted_jaccard_coefficient(pp_graph, refset, travset);
1310+
if (jaccard < outlier_jaccard) {
1311+
outlier_idx = i;
1312+
outlier_jaccard = jaccard;
1313+
outlier_len = travlen;
1314+
}
1315+
}
1316+
cout << pp_graph->get_path_name(pp_graph->get_path_handle_of_step(trav_intervals[outlier_idx].first))
1317+
<< "\t" << outlier_jaccard << "\t" << outlier_len << "\t";
1318+
} else {
1319+
cout << ".\t.\t.\t";
1320+
}
1321+
}
12581322
}
12591323
}
12601324

0 commit comments

Comments
 (0)