3434#include " ../io/save_handle_graph.hpp"
3535#include " ../gbzgraph.hpp"
3636#include " ../traversal_finder.hpp"
37+ #include " ../traversal_clusters.hpp"
3738
3839using namespace std ;
3940using namespace vg ;
@@ -66,7 +67,8 @@ void help_stats(char** argv) {
6667 << " -O, --overlap-all print overlap table for the cartesian product of paths" << endl
6768 << " -R, --snarls print statistics for each snarl" << endl
6869 << " --snarl-contents print out a table of <snarl, depth, parent, contained node ids>" << endl
69- << " --snarl-sample print out reference coordinates on given sample" << endl
70+ << " --snarl-sample S print out reference coordinates on given sample S" << endl
71+ << " --snarl-outlier print most diverged path from reference (--snarl-sample required)" << endl
7072 << " -C, --chains print statistics for each chain" << endl
7173 << " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " <<
7274 " Can't detect Protobuf if graph read from stdin" << endl
@@ -109,13 +111,15 @@ int main_stats(int argc, char** argv) {
109111 bool chain_stats = false ;
110112 bool snarl_contents = false ;
111113 string snarl_sample;
114+ bool snarl_outlier = false ;
112115 bool format = false ;
113116 bool degree_dist = false ;
114117 string distance_index_filename;
115118
116119 // Long options with no corresponding short options.
117120 constexpr int OPT_SNARL_CONTENTS = 1000 ;
118121 constexpr int OPT_SNARL_SAMPLE = 1001 ;
122+ constexpr int OPT_SNARL_OUTLIER = 1002 ;
119123 constexpr int64_t snarl_search_context = 100 ;
120124
121125 int c;
@@ -146,6 +150,7 @@ int main_stats(int argc, char** argv) {
146150 {" snarls" , no_argument, 0 , ' R' },
147151 {" snarl-contents" , no_argument, 0 , OPT_SNARL_CONTENTS},
148152 {" snarl-sample" , required_argument, 0 , OPT_SNARL_SAMPLE},
153+ {" snarl-outlier" , no_argument, 0 , OPT_SNARL_OUTLIER},
149154 {" chains" , no_argument, 0 , ' C' },
150155 {" format" , no_argument, 0 , ' F' },
151156 {" degree-dist" , no_argument, 0 , ' D' },
@@ -251,6 +256,10 @@ int main_stats(int argc, char** argv) {
251256 case OPT_SNARL_SAMPLE:
252257 snarl_sample = optarg;
253258 break ;
259+
260+ case OPT_SNARL_OUTLIER:
261+ snarl_outlier = true ;
262+ break ;
254263
255264 case ' C' :
256265 chain_stats = true ;
@@ -297,6 +306,11 @@ int main_stats(int argc, char** argv) {
297306 exit (1 );
298307 }
299308
309+ if (snarl_outlier && snarl_sample.empty ()) {
310+ cerr << " error [vg stats]: --snarl-outlier can only be used with --snarl-sample" << endl;
311+ exit (1 );
312+ }
313+
300314 bdsg::ReferencePathOverlayHelper overlay_helper;
301315 unique_ptr<PathHandleGraph> path_handle_graph;
302316 PathHandleGraph* graph = nullptr ;
@@ -1142,18 +1156,24 @@ int main_stats(int argc, char** argv) {
11421156 if (!snarl_sample.empty ()) {
11431157 // optionally prefix with bed-like refpath coordinates if --snarl-sample given
11441158 cout <<" Contig\t StartPos\t EndPos\t " ;
1159+ if (snarl_outlier) {
1160+ cout << " OutlierName\t OutlierJaccard\t OutlierLen\t " ;
1161+ }
11451162
11461163 if (pp_graph == nullptr ) {
11471164 pp_graph = overlay_helper.apply (graph);
11481165 }
11491166 vector<string> ref_path_names;
11501167 pp_graph->for_each_path_of_sample (snarl_sample, [&](path_handle_t path_handle) {
1151- ref_path_names.push_back (graph ->get_path_name (path_handle));
1168+ ref_path_names.push_back (pp_graph ->get_path_name (path_handle));
11521169 });
11531170 if (ref_path_names.empty ()) {
11541171 cerr << " error [vg stats]: unable to find any paths of --snarl-sample " << snarl_sample << endl;
11551172 exit (1 );
11561173 }
1174+ if (snarl_outlier) {
1175+ ref_path_names.clear (); // if searching for outliers, we want all traversals
1176+ }
11571177 path_trav_finder = unique_ptr<PathTraversalFinder>(new PathTraversalFinder (*pp_graph, ref_path_names));
11581178 }
11591179 cout << " Start\t Start-Reversed\t End\t End-Reversed\t Ultrabubble\t Unary\t Shallow-Nodes\t Shallow-Edges\t Shallow-bases\t Deep-Nodes\t Deep-Edges\t Deep-Bases\t Depth\t Children\t Chains\t Chains-Children\t Net-Graph-Size\n " ;
@@ -1164,15 +1184,26 @@ int main_stats(int argc, char** argv) {
11641184 if (snarl_stats) {
11651185 if (!snarl_sample.empty ()) {
11661186 // find the reference interval from path index if snarl lies directly on reference path
1167- vector<PathInterval> travs;
1168- std::tie (std::ignore, travs) = path_trav_finder->find_path_traversals (
1187+ vector<Traversal> travs;
1188+ vector<PathInterval> trav_intervals;
1189+ std::tie (travs, trav_intervals) = path_trav_finder->find_path_traversals (
11691190 pp_graph->get_handle (snarl->start ().node_id (), snarl->start ().backward ()),
11701191 pp_graph->get_handle (snarl->end ().node_id (), snarl->end ().backward ()));
1171- if (travs.empty () && snarl_to_ref.count (manager.parent_of (snarl))) {
1192+ for (int64_t i = 0 ; i < trav_intervals.size (); ++i) {
1193+ // make sure ref trav is first
1194+ if (pp_graph->get_sample_name (pp_graph->get_path_handle_of_step (trav_intervals[i].first )) == snarl_sample) {
1195+ if (i > 1 ) {
1196+ swap (travs[0 ], travs[1 ]);
1197+ swap (trav_intervals[0 ], trav_intervals[i]);
1198+ }
1199+ break ;
1200+ }
1201+ }
1202+ if (trav_intervals.empty () && snarl_to_ref.count (manager.parent_of (snarl))) {
11721203 // snarl's not on the reference path, us its parent interval
1173- travs .push_back (snarl_to_ref.at (manager.parent_of (snarl)));
1204+ trav_intervals .push_back (snarl_to_ref.at (manager.parent_of (snarl)));
11741205 }
1175- if (travs .empty ()) {
1206+ if (trav_intervals .empty ()) {
11761207 // search toward the reference path
11771208 unordered_map<path_handle_t , vector<step_handle_t >> start_steps;
11781209 unordered_map<path_handle_t , vector<step_handle_t >> end_steps;
@@ -1206,43 +1237,43 @@ int main_stats(int argc, char** argv) {
12061237 step != graph->path_end (graph->get_path_handle_of_step (start_path_steps.second [0 ]));
12071238 step = graph->get_next_step (step)) {
12081239 if (end_ids.count (graph->get_id (graph->get_handle_of_step (step)))) {
1209- travs .push_back (make_pair (start_path_steps.second [0 ],
1240+ trav_intervals .push_back (make_pair (start_path_steps.second [0 ],
12101241 end_ids[graph->get_id (graph->get_handle_of_step (step))]));
12111242 break ;
12121243 }
12131244 }
12141245 // search backward
1215- if (travs .empty ()) {
1246+ if (trav_intervals .empty ()) {
12161247 for (step_handle_t step = graph->get_previous_step (start_path_steps.second [0 ]);
12171248 step != graph->path_front_end (graph->get_path_handle_of_step (start_path_steps.second [0 ]));
12181249 step = graph->get_previous_step (step)) {
12191250 if (end_ids.count (graph->get_id (graph->get_handle_of_step (step)))) {
1220- travs .push_back (make_pair (start_path_steps.second [0 ],
1251+ trav_intervals .push_back (make_pair (start_path_steps.second [0 ],
12211252 end_ids[graph->get_id (graph->get_handle_of_step (step))]));
12221253 break ;
12231254 }
12241255 }
12251256 }
12261257 }
1227- if (!travs .empty ()) {
1258+ if (!trav_intervals .empty ()) {
12281259 break ;
12291260 }
12301261 }
12311262 // unlikely, but maybe only one end of the snarl reaches the reference. in that case
12321263 // we just consider it both ends of the interval
1233- if (travs .empty () && !start_steps.empty ()) {
1234- travs .push_back (make_pair (start_steps.begin ()->second .front (), start_steps.begin ()->second .front ()));
1264+ if (trav_intervals .empty () && !start_steps.empty ()) {
1265+ trav_intervals .push_back (make_pair (start_steps.begin ()->second .front (), start_steps.begin ()->second .front ()));
12351266 }
1236- if (travs .empty () && !end_steps.empty ()) {
1237- travs .push_back (make_pair (end_steps.begin ()->second .front (), end_steps.begin ()->second .front ()));
1267+ if (trav_intervals .empty () && !end_steps.empty ()) {
1268+ trav_intervals .push_back (make_pair (end_steps.begin ()->second .front (), end_steps.begin ()->second .front ()));
12381269 }
12391270 }
1240- if (travs .empty ()) {
1271+ if (trav_intervals .empty ()) {
12411272 cout << " .\t .\t .\t " ;
12421273 } else {
12431274 // note: in case of a cycle, we're just taking the first found
1244- step_handle_t start_step = travs .front ().first ;
1245- step_handle_t end_step = travs .front ().second ;
1275+ step_handle_t start_step = trav_intervals .front ().first ;
1276+ step_handle_t end_step = trav_intervals .front ().second ;
12461277 int64_t start_pos = pp_graph->get_position_of_step (start_step);
12471278 int64_t end_pos = pp_graph->get_position_of_step (end_step);
12481279 if (end_pos < start_pos) {
@@ -1254,7 +1285,40 @@ int main_stats(int argc, char** argv) {
12541285 << end_pos << " \t " ;
12551286
12561287 // use this interval for off-reference children
1257- snarl_to_ref[snarl] = travs.front ();
1288+ snarl_to_ref[snarl] = trav_intervals.front ();
1289+
1290+ // print the outlier of snarl (only works if snarl is exactly on reference since it
1291+ // requires the full traversal list)
1292+ if (snarl_outlier) {
1293+ int64_t outlier_idx = -1 ;
1294+ double outlier_jaccard = numeric_limits<double >::max ();
1295+ int64_t outlier_len = -1 ;
1296+ if (trav_intervals.size () > 1 ) {
1297+ vector<double > jaccards (trav_intervals.size ());
1298+ multiset<handle_t > refset;
1299+ for (handle_t handle : travs[0 ]) {
1300+ refset.insert (handle);
1301+ }
1302+ for (int64_t i = 1 ; i < trav_intervals.size (); ++i) {
1303+ multiset<handle_t > travset;
1304+ int64_t travlen = 0 ;
1305+ for (handle_t handle : travs[i]) {
1306+ travset.insert (handle);
1307+ travlen += pp_graph->get_length (handle);
1308+ }
1309+ double jaccard = weighted_jaccard_coefficient (pp_graph, refset, travset);
1310+ if (jaccard < outlier_jaccard) {
1311+ outlier_idx = i;
1312+ outlier_jaccard = jaccard;
1313+ outlier_len = travlen;
1314+ }
1315+ }
1316+ cout << pp_graph->get_path_name (pp_graph->get_path_handle_of_step (trav_intervals[outlier_idx].first ))
1317+ << " \t " << outlier_jaccard << " \t " << outlier_len << " \t " ;
1318+ } else {
1319+ cout << " .\t .\t .\t " ;
1320+ }
1321+ }
12581322 }
12591323 }
12601324
0 commit comments