Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
663 changes: 509 additions & 154 deletions src/multipath_alignment_graph.cpp

Large diffs are not rendered by default.

154 changes: 114 additions & 40 deletions src/multipath_alignment_graph.hpp

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions src/path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2233,6 +2233,130 @@ pair<Path, Path> cut_path(const Path& path, size_t offset) {
return make_pair(p1, p2);
}

// cut a path_mapping_t at a to_length offset
pair<path_mapping_t, path_mapping_t> cut_mapping(const path_mapping_t& m, size_t offset) {
path_mapping_t left, right;

// both result mappings will be in the same orientation as the original
left.mutable_position()->set_is_reverse(m.position().is_reverse());
right.mutable_position()->set_is_reverse(m.position().is_reverse());

// left always has the position of the input mapping
*left.mutable_position() = m.position();

// nothing to cut
if (offset == 0) {
// we will get a 0-length left
right = m;
} else if (offset >= mapping_to_length(m)) {
// or a 0-length right
left = m;
} else {
// we need to cut the mapping

// find the cut point and build the two mappings
size_t seen = 0;
size_t j = 0;
// loop over those before our position
for ( ; j < m.edit_size() && seen < offset; ++j) {
const edit_t& e = m.edit(j);
if (seen + e.to_length() > offset) {
// we need to divide this edit
// Convert to protobuf, cut, then convert back
Edit proto_edit;
to_proto_edit(e, proto_edit);
auto proto_edits = cut_edit_at_to(proto_edit, seen + e.to_length() - offset);
from_proto_edit(proto_edits.first, *left.add_edit());
from_proto_edit(proto_edits.second, *right.add_edit());
} else {
// this would be the last edit before the target position
*left.add_edit() = e;
}
seen += e.to_length();
}
// now we add to the second path
for ( ; j < m.edit_size(); ++j) {
*right.add_edit() = m.edit(j);
}
}
// The right mapping has a position on this same node
right.mutable_position()->set_node_id(m.position().node_id());
right.mutable_position()->set_offset(left.position().offset() + mapping_from_length(left));

return make_pair(left, right);
}

// divide the path_t at a path-relative offset as measured in to_length from start
pair<path_t, path_t> cut_path(const path_t& path, size_t offset) {
path_t p1, p2;
size_t seen = 0;
size_t i = 0;
if (!path.mapping_size()) {
return make_pair(path, path);
}

// seek forward to the cut point
for ( ; i < path.mapping_size() && seen < offset; ++i) {
const path_mapping_t& m = path.mapping(i);
// the position is in this node, so make the cut
if (seen + mapping_to_length(m) == offset) {
*p1.add_mapping() = m;
} else if (seen + mapping_to_length(m) > offset) {
auto mappings = cut_mapping(m, offset - seen);
// and save the cuts
*p1.add_mapping() = mappings.first;
*p2.add_mapping() = mappings.second;
++i; // we don't increment our mapping index when we break here
seen += mapping_to_length(m); // same problem
break;
} else {
// otherwise keep adding the mappings onto our first path
*p1.add_mapping() = m;
}
seen += mapping_to_length(m);
}

// add in the rest of the edits
for ( ; i < path.mapping_size(); ++i) {
const path_mapping_t& m = path.mapping(i);
*p2.add_mapping() = m;
}

return make_pair(p1, p2);
}

path_t extract_subpath(const path_t& path, size_t start_offset, size_t end_offset) {
// Handle edge cases
if (start_offset >= end_offset) {
return path_t(); // return empty path
}

size_t path_len = path_to_length(path);
if (start_offset >= path_len) {
return path_t(); // return empty path
}

// Clamp end_offset to path length
if (end_offset > path_len) {
end_offset = path_len;
}

// If we're extracting the whole path, just return it
if (start_offset == 0 && end_offset == path_len) {
return path;
}

// Cut at start to get everything from start onward
auto first_cut = cut_path(path, start_offset);
path_t suffix = first_cut.second;

// Cut the suffix at (end_offset - start_offset) to get just the middle part
size_t middle_length = end_offset - start_offset;
auto second_cut = cut_path(suffix, middle_length);

return second_cut.first;
}

bool maps_to_node(const Path& p, id_t id) {
for (size_t i = 0; i < p.mapping_size(); ++i) {
if (p.mapping(i).position().node_id() == id) return true;
Expand Down
6 changes: 6 additions & 0 deletions src/path.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,12 @@ pos_t final_position(const path_t& path);
int corresponding_to_length(const path_t& path, int from_length, bool from_end);
int corresponding_from_length(const path_t& path, int to_length, bool from_end);

pair<path_t, path_t> cut_path(const path_t& path, size_t offset);
pair<path_mapping_t, path_mapping_t> cut_mapping(const path_mapping_t& m, size_t offset);
/// Extract a subpath from start_offset to end_offset (measured in to_length,
/// along the read side)
path_t extract_subpath(const path_t& path, size_t start_offset, size_t end_offset);

string debug_string(const path_t& path);
string debug_string(const path_mapping_t& mapping);
string debug_string(const edit_t& edit);
Expand Down
20 changes: 16 additions & 4 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "../multipath_alignment_emitter.hpp"
#include "../crash.hpp"
#include "../watchdog.hpp"
#include "../explainer.hpp"


using namespace std;
Expand Down Expand Up @@ -57,6 +58,8 @@ void help_surject(char** argv) {
<< " -l, --subpath-local let the multipath mapping surjection produce local" << endl
<< " (rather than global) alignments" << endl
<< " -T, --max-tail-len N only align up to N bases of read tails [10000]" << endl
<< " -e, --max-tail-cells N only fill up to N alignment matrix cells to align tails" << endl
<< " (default: " << Surjector::DEFAULT_MAX_TAIL_CELLS << ")" << endl
<< " -g, --max-graph-scale X make reads unmapped if alignment target subgraph" << endl
<< " size exceeds read length by a factor of X " << endl
<< " (default: " << Surjector::DEFAULT_SUBGRAPH_LIMIT
Expand Down Expand Up @@ -160,7 +163,8 @@ int main_surject(int argc, char** argv) {
int64_t min_splice_length = 20;
size_t watchdog_timeout = 10;
bool subpath_global = true; // force full length alignments in mpmap resolution
size_t max_tail_len = 10000;
size_t max_tail_length = 10000;
uint64_t max_tail_cells = Surjector::DEFAULT_MAX_TAIL_CELLS;
// This needs to be nullable so that we can use the default for spliced if doing spliced mode.
std::unique_ptr<double> max_graph_scale;
bool qual_adj = false;
Expand Down Expand Up @@ -190,6 +194,7 @@ int main_surject(int argc, char** argv) {
{"ref-sample", required_argument, 0, 'n'}, // Provide an alias to match Giraffe
{"subpath-local", no_argument, 0, 'l'},
{"max-tail-len", required_argument, 0, 'T'},
{"max-tail-cells", required_argument, 0, 'e'},
{"max-graph-scale", required_argument, 0, 'g'},
{"interleaved", no_argument, 0, 'i'},
{"multimap", no_argument, 0, 'M'},
Expand Down Expand Up @@ -218,7 +223,7 @@ int main_surject(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "h?x:p:F:n:lT:g:iGmcbsuN:R:f:C:t:SPI:a:AE:LHMVw:r",
c = getopt_long (argc, argv, "h?x:p:F:n:lT:e:g:iGmcbsuN:R:f:C:t:SPI:a:AE:LHMVw:r",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -249,7 +254,11 @@ int main_surject(int argc, char** argv) {
break;

case 'T':
max_tail_len = parse<size_t>(optarg);
max_tail_length = parse<size_t>(optarg);
break;

case 'e':
max_tail_cells = parse<uint64_t>(optarg);
break;

case 'g':
Expand Down Expand Up @@ -364,6 +373,8 @@ int main_surject(int argc, char** argv) {
}
}

Explainer::save_explanations = true;

string file_name = get_input_file_name(optind, argc, argv);

if (have_input_file(optind, argc, argv)) {
Expand Down Expand Up @@ -449,7 +460,8 @@ int main_surject(int argc, char** argv) {
else {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}
surjector.max_tail_length = max_tail_len;
surjector.max_tail_length = max_tail_length;
surjector.max_tail_cells = max_tail_cells;
surjector.annotate_with_all_path_scores = annotate_with_all_path_scores;
surjector.annotate_with_graph_alignment = annotate_with_graph_alignment;
if (max_graph_scale) {
Expand Down
7 changes: 6 additions & 1 deletion src/surjector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ using namespace std;

/// the maximum length of a tail that we will try to align
size_t max_tail_length = 10000;

/// An accessible default max_tail_cells
static constexpr double DEFAULT_MAX_TAIL_CELLS = 25000000000;
/// The maximum number of cells that we are willing to fill when aligning a tail
uint64_t max_tail_cells = DEFAULT_MAX_TAIL_CELLS;

// the maximum number of estimated band cells that we are willing to try to fill when connecting anchors
uint64_t max_band_cells = 8000000000;
Expand All @@ -144,7 +149,7 @@ using namespace std;
static constexpr double DEFAULT_SUBGRAPH_LIMIT = 100 * 1024 / 125.0;
/// How big of a graph (in graph bases per read base) should we ever try to align against for realigning surjection?
double max_subgraph_bases_per_read_base = DEFAULT_SUBGRAPH_LIMIT;
/// Don't refuse to align (graph size) * (read size) is at least this size (overrides max_subgraph_bases_per_read_base)
/// Don't refuse to align unless (graph size) * (read size) is at least this size (overrides max_subgraph_bases_per_read_base)
int64_t min_absolute_align_size_to_refuse = 1024;

/// in spliced surject, downsample if the base-wise average coverage by chunks is this high
Expand Down
Loading
Loading