Skip to content

Commit f31e875

Browse files
committed
Merge remote-tracking branch 'origin/master' into deconstruct
2 parents 7ccb000 + 284f30a commit f31e875

35 files changed

+7114
-6005
lines changed

src/algorithms/chain_items.cpp

Lines changed: 247 additions & 155 deletions
Large diffs are not rendered by default.

src/algorithms/chain_items.hpp

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,15 +148,30 @@ class Anchor {
148148

149149
/// Compose a read start position, graph start position, and match length into an Anchor.
150150
/// Can also bring along a distance hint and a seed number.
151-
inline Anchor(size_t read_start, const pos_t& graph_start, size_t length, size_t margin_before, size_t margin_after, int score, size_t seed_number = std::numeric_limits<size_t>::max(), ZipCode* hint = nullptr, size_t hint_start = 0, bool skippable = false) : start(read_start), size(length), margin_before(margin_before), margin_after(margin_after), start_pos(graph_start), end_pos(advance(graph_start, length)), points(score), start_seed(seed_number), end_seed(seed_number), start_zip(hint), end_zip(hint), start_offset(hint_start), end_offset(length - hint_start), seed_length(margin_before + length + margin_after), skippable(skippable) {
151+
inline Anchor(size_t read_start, const pos_t& graph_start, size_t length, size_t margin_before, size_t margin_after,
152+
int score, size_t seed_number = std::numeric_limits<size_t>::max(),
153+
ZipCode* hint = nullptr, size_t hint_start = 0, bool skippable = false) :
154+
start(read_start), size(length), margin_before(margin_before), margin_after(margin_after),
155+
start_pos(graph_start), end_pos(advance(graph_start, length)), points(score),
156+
start_seed(seed_number), end_seed(seed_number), start_zip(hint), end_zip(hint),
157+
start_offset(hint_start), end_offset(length - hint_start),
158+
seed_length(margin_before + length + margin_after), skippable(skippable) {
152159
// Nothing to do!
153160
}
154161

155162
/// Compose two Anchors into an Anchor that represents coming in through
156163
/// the first one and going out through the second, like a tunnel. Useful
157164
/// for representing chains as chainable items.
158-
inline Anchor(const Anchor& first, const Anchor& last, size_t extra_margin_before, size_t extra_margin_after, int score) : start(first.read_start()), size(last.read_end() - first.read_start()), margin_before(first.margin_before + extra_margin_before), margin_after(last.margin_after + extra_margin_after), start_pos(first.graph_start()), end_pos(last.graph_end()), points(score), start_seed(first.seed_start()), end_seed(last.seed_end()), start_zip(first.start_hint()), end_zip(last.end_hint()), start_offset(first.start_offset), end_offset(last.end_offset), seed_length((first.base_seed_length() + last.base_seed_length()) / 2), skippable(first.is_skippable() || last.is_skippable()) {
159-
// Nothing to do!
165+
inline Anchor(const Anchor& first, const Anchor& last,
166+
size_t extra_margin_before, size_t extra_margin_after, int score) :
167+
start(first.read_start()), size(last.read_end() - first.read_start()),
168+
margin_before(first.margin_before + extra_margin_before),
169+
margin_after(last.margin_after + extra_margin_after), start_pos(first.graph_start()),
170+
end_pos(last.graph_end()), points(score), start_seed(first.seed_start()), end_seed(last.seed_end()),
171+
start_zip(first.start_hint()), end_zip(last.end_hint()),
172+
start_offset(first.start_offset), end_offset(last.end_offset),
173+
seed_length((first.base_seed_length() + last.base_seed_length()) / 2),
174+
skippable(first.is_skippable() || last.is_skippable()) {
160175
}
161176

162177
// Act like data
@@ -319,6 +334,37 @@ transition_iterator zip_tree_transition_iterator(const std::vector<SnarlDistance
319334
size_t max_graph_lookback_bases,
320335
size_t max_read_lookback_bases);
321336

337+
/**
338+
* Walk through the ziptree from left to right to find all seeds,
339+
* then generate all possible transitions from each given seed.
340+
*
341+
* Calls ZipCodeTree.find_distances() as the core of the algorithm.
342+
* Used as a helper by zip_tree_transition_iterator().
343+
*
344+
* Transitions are (source anchor, destination anchor, graph distance).
345+
*/
346+
std::vector<std::tuple<size_t, size_t, size_t>> generate_zip_tree_transitions(
347+
const std::vector<SnarlDistanceIndexClusterer::Seed>& seeds,
348+
const ZipCodeTree& zip_code_tree,
349+
size_t max_graph_lookback_bases,
350+
const std::unordered_map<size_t, size_t>& seed_to_starting,
351+
const std::unordered_map<size_t, size_t>& seed_to_ending);
352+
353+
/**
354+
* Calculate read distances for each of the zip tree's transitions.
355+
* Also filters out transitions that can't be used,
356+
* e.g. not reachable in the read.
357+
*
358+
* Used as a helper by zip_tree_transition_iterator().
359+
*
360+
* Transitions are taken as (source anchor, destination anchor, graph distance)
361+
* and output as (source anchor, destination anchor, graph distance, read distance).
362+
*/
363+
std::vector<std::tuple<size_t, size_t, size_t, size_t>> calculate_transition_read_distances(
364+
const std::vector<std::tuple<size_t, size_t, size_t>>& all_transitions,
365+
const VectorView<Anchor>& to_chain,
366+
size_t max_read_lookback_bases);
367+
322368
/**
323369
* Fill in the given DP table for the explored chain scores ending with each
324370
* item. Returns the best observed score overall from that table, with

src/alignment.cpp

Lines changed: 235 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,8 @@ string alignment_to_sam_internal(const Alignment& alignment,
614614
bool paired,
615615
const int32_t tlen_max) {
616616

617+
618+
617619
// Determine flags, using orientation, next/prev fragments, and pairing status.
618620
int32_t flags = determine_flag(alignment, refseq, refpos, refrev, mateseq, matepos, materev, tlen, paired, tlen_max);
619621

@@ -691,8 +693,12 @@ int32_t determine_flag(const Alignment& alignment,
691693
// Determine flags, using orientation, next/prev fragments, and pairing status.
692694
int32_t flags = sam_flag(alignment, refrev, paired);
693695

694-
// We've observed some reads with the unmapped flag set and also a CIGAR string set, which shouldn't happen.
695-
// We will check for this. The CIGAR string will only be set in the output if the alignment has a path.
696+
// We've observed some reads with the unmapped flag set and also a CIGAR
697+
// string set, which is allowed by the SAM spec but which we are not
698+
// supposed to generate.
699+
//
700+
// We will check for this. The CIGAR string will only be set in the output
701+
// if the alignment has a path.
696702
assert((bool)(flags & BAM_FUNMAP) != (alignment.has_path() && alignment.path().mapping_size()));
697703

698704
if (!((bool)(flags & BAM_FUNMAP)) && paired && !refseq.empty() && refseq == mateseq) {
@@ -725,6 +731,10 @@ int32_t determine_flag(const Alignment& alignment,
725731
flags |= BAM_FMREVERSE;
726732
}
727733

734+
if (is_supplementary(alignment)) {
735+
flags |= BAM_FSUPPLEMENTARY;
736+
}
737+
728738
return flags;
729739
}
730740

@@ -805,7 +815,7 @@ bam1_t* alignment_to_bam_internal(bam_hdr_t* header,
805815
const bool refrev,
806816
const vector<pair<int, char>>& cigar,
807817
const string& mateseq,
808-
const int32_t matepos,
818+
int32_t matepos,
809819
bool materev,
810820
const int32_t tlen,
811821
bool paired,
@@ -1362,6 +1372,131 @@ vector<pair<int, char>> cigar_against_path(const Alignment& alignment, bool on_r
13621372
return cigar;
13631373
}
13641374

1375+
vector<pair<int, char>> spliced_cigar_against_path(const Alignment& aln, const PathPositionHandleGraph& graph, const string& path_name,
1376+
int64_t pos, bool rev, int64_t min_splice_length) {
1377+
// the return value
1378+
vector<pair<int, char>> cigar;
1379+
1380+
if (aln.has_path() && aln.path().mapping_size() > 0) {
1381+
// the read is aligned to the path
1382+
1383+
path_handle_t path_handle = graph.get_path_handle(path_name);
1384+
step_handle_t step = graph.get_step_at_position(path_handle, pos);
1385+
1386+
// to indicate whether we've found the edit that corresponds to the BAM position
1387+
bool found_pos = false;
1388+
1389+
const Path& path = aln.path();
1390+
for (size_t i = 0; i < path.mapping_size(); ++i) {
1391+
1392+
// we traverse backwards on a reverse strand mapping
1393+
const Mapping& mapping = path.mapping(rev ? path.mapping_size() - 1 - i : i);
1394+
1395+
for (size_t j = 0; j < mapping.edit_size(); ++j) {
1396+
1397+
// we traverse backwards on a reverse strand mapping
1398+
const Edit& edit = mapping.edit(rev ? mapping.edit_size() - 1 - j : j);
1399+
1400+
if (!found_pos) {
1401+
// we may still be searching through an initial softclip to find
1402+
// the edit that corresponds to the BAM position
1403+
if (edit.to_length() > 0 && edit.from_length() == 0) {
1404+
append_cigar_operation(edit.to_length(), 'S', cigar);
1405+
// skip the main block where we assign cigar operations
1406+
continue;
1407+
}
1408+
else {
1409+
found_pos = true;
1410+
}
1411+
}
1412+
1413+
// identify the cigar operation
1414+
char cigar_code;
1415+
int length;
1416+
if (edit.from_length() == edit.to_length()) {
1417+
cigar_code = 'M';
1418+
length = edit.from_length();
1419+
}
1420+
else if (edit.from_length() > 0 && edit.to_length() == 0) {
1421+
cigar_code = 'D';
1422+
length = edit.from_length();
1423+
}
1424+
else if (edit.to_length() > 0 && edit.from_length() == 0) {
1425+
cigar_code = 'I';
1426+
length = edit.to_length();
1427+
}
1428+
else {
1429+
throw std::runtime_error("Spliced CIGAR construction can only convert simple edits");
1430+
}
1431+
1432+
append_cigar_operation(length, cigar_code, cigar);
1433+
} // close loop over edits
1434+
1435+
if (found_pos && i + 1 < path.mapping_size()) {
1436+
// we're anchored on the path by the annotated position, and we're transitioning between
1437+
// two mappings, so we should check for a deletion/splice edge
1438+
1439+
step_handle_t next_step = graph.get_next_step(step);
1440+
1441+
handle_t next_handle = graph.get_handle_of_step(next_step);
1442+
const Position& next_pos = path.mapping(rev ? path.mapping_size() - 2 - i : i + 1).position();
1443+
if (graph.get_id(next_handle) != next_pos.node_id()
1444+
|| (graph.get_is_reverse(next_handle) != next_pos.is_reverse()) != rev) {
1445+
1446+
// the next mapping in the alignment is not the next mapping on the path, so we must have
1447+
// taken a deletion
1448+
1449+
// find the closest step that is further along the path than the current one
1450+
// and matches the next position (usually there will only be one)
1451+
size_t curr_offset = graph.get_position_of_step(step);
1452+
size_t nearest_offset = numeric_limits<size_t>::max();
1453+
graph.for_each_step_on_handle(graph.get_handle(next_pos.node_id()),
1454+
[&](const step_handle_t& candidate) {
1455+
1456+
if (graph.get_path_handle_of_step(candidate) == path_handle) {
1457+
size_t candidate_offset = graph.get_position_of_step(candidate);
1458+
if (candidate_offset < nearest_offset && candidate_offset > curr_offset) {
1459+
nearest_offset = candidate_offset;
1460+
next_step = candidate;
1461+
}
1462+
}
1463+
});
1464+
1465+
if (nearest_offset == numeric_limits<size_t>::max()) {
1466+
throw std::runtime_error("Spliced BAM conversion could not find path steps that match alignment");
1467+
}
1468+
1469+
// the gap between the current step and the next one along the path
1470+
size_t deletion_length = (nearest_offset - curr_offset -
1471+
graph.get_length(graph.get_handle_of_step(step)));
1472+
1473+
// add to the cigar
1474+
if (deletion_length >= min_splice_length) {
1475+
// long enough to be a splice
1476+
append_cigar_operation(deletion_length, 'N', cigar);
1477+
}
1478+
else if (deletion_length) {
1479+
// create or extend a deletion
1480+
append_cigar_operation(deletion_length, 'D', cigar);
1481+
}
1482+
}
1483+
1484+
// iterate along the path
1485+
step = next_step;
1486+
}
1487+
} // close loop over mappings
1488+
1489+
if (cigar.back().second == 'I') {
1490+
// the final insertion is actually a softclip
1491+
cigar.back().second = 'S';
1492+
}
1493+
}
1494+
1495+
simplify_cigar(cigar);
1496+
1497+
return cigar;
1498+
}
1499+
13651500
void simplify_cigar(vector<pair<int, char>>& cigar) {
13661501

13671502
size_t removed = 0;
@@ -1664,7 +1799,7 @@ int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired
16641799
// TODO: catch reads with pair partners when they shouldn't be paired?
16651800
}
16661801

1667-
if (!alignment.has_path() || alignment.path().mapping_size() == 0) {
1802+
if (!is_mapped(alignment)) {
16681803
// unmapped
16691804
flag |= BAM_FUNMAP;
16701805
}
@@ -1823,18 +1958,28 @@ Alignment bam_to_alignment(const bam1_t *b,
18231958

18241959
}
18251960
alignment.set_read_paired((b->core.flag & BAM_FPAIRED) != 0);
1961+
1962+
// Consult the One True Place in SAM for knowing if a read is aligned or not.
1963+
bool is_aligned = !(b->core.flag & BAM_FUNMAP);
18261964

1827-
if (graph != nullptr && bh != nullptr && b->core.tid >= 0) {
1965+
if (is_aligned) {
1966+
// Set the little-used GAM field that exists to represent this.
1967+
alignment.set_read_mapped(is_aligned);
1968+
1969+
// Use the MAPQ
18281970
alignment.set_mapping_quality(b->core.qual);
1829-
alignment.set_read_mapped(true);
1830-
// Look for the path handle this is against.
1831-
auto found = tid_path_handle.find(b->core.tid);
1832-
if (found == tid_path_handle.end()) {
1833-
cerr << "[vg::alignment.cpp] error: alignment references path not present in graph: "
1834-
<< bh->target_name[b->core.tid] << endl;
1835-
exit(1);
1971+
1972+
if (graph != nullptr && bh != nullptr && b->core.tid >= 0) {
1973+
// We can actually build the path for this read.
1974+
// Look for the path handle this is against.
1975+
auto found = tid_path_handle.find(b->core.tid);
1976+
if (found == tid_path_handle.end()) {
1977+
cerr << "[vg::alignment.cpp] error: alignment references path not present in graph: "
1978+
<< bh->target_name[b->core.tid] << endl;
1979+
exit(1);
1980+
}
1981+
mapping_against_path(alignment, b, found->second, graph, b->core.flag & BAM_FREVERSE);
18361982
}
1837-
mapping_against_path(alignment, b, found->second, graph, b->core.flag & BAM_FREVERSE);
18381983
}
18391984

18401985
// TODO: htslib doesn't wrap this flag for some reason.
@@ -1856,7 +2001,8 @@ Alignment bam_to_alignment(const bam1_t *b,
18562001
}
18572002
++removed;
18582003
}
1859-
else if (tag_name == "AS") {
2004+
else if (tag_name == "AS" && is_aligned) {
2005+
// Set the score, for aligned reads.
18602006
alignment.set_score(parse<int64_t>(tag.substr(5, string::npos)));
18612007
++removed;
18622008
}
@@ -2228,9 +2374,8 @@ int softclip_trim(Alignment& alignment) {
22282374
}
22292375

22302376
bool is_perfect(const Alignment& alignment) {
2231-
// Non-aligned paths can't be perfect (code from subcommand/stats_main.cpp)
2232-
bool has_alignment = alignment.score() > 0 || alignment.path().mapping_size() > 0;
2233-
if (!has_alignment) return false;
2377+
// Non-aligned paths can't be perfect
2378+
if (!is_mapped(alignment)) return false;
22342379

22352380
// Check that the path is perfect
22362381
for (size_t i = 0; i < alignment.path().mapping_size(); ++i) {
@@ -2242,6 +2387,79 @@ bool is_perfect(const Alignment& alignment) {
22422387
return true;
22432388
}
22442389

2390+
bool is_supplementary(const Alignment& alignment) {
2391+
if (!has_annotation(alignment, "supplementary")) {
2392+
return false;
2393+
}
2394+
else {
2395+
return get_annotation<bool>(alignment, "supplementary");
2396+
}
2397+
}
2398+
2399+
pair<int64_t, int64_t> aligned_interval(const Alignment& aln) {
2400+
return pair<int64_t, int64_t>(softclip_start(aln), aln.sequence().size() - softclip_end(aln));
2401+
}
2402+
2403+
string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) {
2404+
subrange_t subrange;
2405+
string path_name = Paths::strip_subrange(path, &subrange);
2406+
if (subrange != PathMetadata::NO_SUBRANGE) {
2407+
pos += subrange.first;
2408+
}
2409+
string info;
2410+
info.append(path_name);
2411+
info.push_back('\t');
2412+
info.append(to_string(pos));
2413+
info.push_back('\t');
2414+
info.push_back(rev_strand ? '1' : '0');
2415+
info.push_back(is_read1 ? '1' : '0');
2416+
return info;
2417+
}
2418+
2419+
tuple<string, int32_t, bool, bool> parse_mate_info(const string& info) {
2420+
tuple<string, int64_t, bool, bool> parsed;
2421+
size_t i = 0;
2422+
while (info[i] != '\t') {
2423+
++i;
2424+
}
2425+
get<0>(parsed) = info.substr(0, i);
2426+
++i;
2427+
size_t j = i;
2428+
while (info[j] != '\t') {
2429+
++j;
2430+
}
2431+
get<1>(parsed) = stoi(info.substr(i, j - i));
2432+
get<2>(parsed) = info[j + 1] == '1';
2433+
get<3>(parsed) = info[j + 2] == '1';
2434+
return parsed;
2435+
}
2436+
2437+
bool is_mapped(const Alignment& alignment) {
2438+
if (alignment.read_mapped()) {
2439+
// The flag for being mapped is set.
2440+
return true;
2441+
}
2442+
2443+
// Otherwise it's not set, but we don't consistently set it in vg. (SAM
2444+
// uses an *un*mapped flag so you don't forget to set it on mapped reads
2445+
// ever.) So second-guess the flag and up on round-tripping SAM with
2446+
// alignment fields set for unmapped reads.
2447+
2448+
if (alignment.path().mapping_size() > 0) {
2449+
// If an alignment path is filled in, it is mapped.
2450+
return true;
2451+
}
2452+
2453+
if (alignment.score() > 0) {
2454+
// If there's no alignment actually there but anonzero score, we
2455+
// represent a mapped read, we're just not sure where to.
2456+
return true;
2457+
}
2458+
2459+
// If there's no evidence of being mapped, we're unmapped.
2460+
return false;
2461+
}
2462+
22452463
int query_overlap(const Alignment& aln1, const Alignment& aln2) {
22462464
if (!alignment_to_length(aln1) || !alignment_to_length(aln2)
22472465
|| !aln1.path().mapping_size() || !aln2.path().mapping_size()

0 commit comments

Comments
 (0)