Skip to content

Commit 284f30a

Browse files
authored
Merge pull request #4699 from eizengaj-roche/surject-supplementary
Add the capability of producing supplementary alignments to vg surject
2 parents 354d2b0 + 360fd3a commit 284f30a

21 files changed

+2561
-997
lines changed

src/alignment.cpp

Lines changed: 179 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,8 @@ string alignment_to_sam_internal(const Alignment& alignment,
614614
bool paired,
615615
const int32_t tlen_max) {
616616

617+
618+
617619
// Determine flags, using orientation, next/prev fragments, and pairing status.
618620
int32_t flags = determine_flag(alignment, refseq, refpos, refrev, mateseq, matepos, materev, tlen, paired, tlen_max);
619621

@@ -729,6 +731,10 @@ int32_t determine_flag(const Alignment& alignment,
729731
flags |= BAM_FMREVERSE;
730732
}
731733

734+
if (is_supplementary(alignment)) {
735+
flags |= BAM_FSUPPLEMENTARY;
736+
}
737+
732738
return flags;
733739
}
734740

@@ -809,7 +815,7 @@ bam1_t* alignment_to_bam_internal(bam_hdr_t* header,
809815
const bool refrev,
810816
const vector<pair<int, char>>& cigar,
811817
const string& mateseq,
812-
const int32_t matepos,
818+
int32_t matepos,
813819
bool materev,
814820
const int32_t tlen,
815821
bool paired,
@@ -1366,6 +1372,131 @@ vector<pair<int, char>> cigar_against_path(const Alignment& alignment, bool on_r
13661372
return cigar;
13671373
}
13681374

1375+
vector<pair<int, char>> spliced_cigar_against_path(const Alignment& aln, const PathPositionHandleGraph& graph, const string& path_name,
1376+
int64_t pos, bool rev, int64_t min_splice_length) {
1377+
// the return value
1378+
vector<pair<int, char>> cigar;
1379+
1380+
if (aln.has_path() && aln.path().mapping_size() > 0) {
1381+
// the read is aligned to the path
1382+
1383+
path_handle_t path_handle = graph.get_path_handle(path_name);
1384+
step_handle_t step = graph.get_step_at_position(path_handle, pos);
1385+
1386+
// to indicate whether we've found the edit that corresponds to the BAM position
1387+
bool found_pos = false;
1388+
1389+
const Path& path = aln.path();
1390+
for (size_t i = 0; i < path.mapping_size(); ++i) {
1391+
1392+
// we traverse backwards on a reverse strand mapping
1393+
const Mapping& mapping = path.mapping(rev ? path.mapping_size() - 1 - i : i);
1394+
1395+
for (size_t j = 0; j < mapping.edit_size(); ++j) {
1396+
1397+
// we traverse backwards on a reverse strand mapping
1398+
const Edit& edit = mapping.edit(rev ? mapping.edit_size() - 1 - j : j);
1399+
1400+
if (!found_pos) {
1401+
// we may still be searching through an initial softclip to find
1402+
// the edit that corresponds to the BAM position
1403+
if (edit.to_length() > 0 && edit.from_length() == 0) {
1404+
append_cigar_operation(edit.to_length(), 'S', cigar);
1405+
// skip the main block where we assign cigar operations
1406+
continue;
1407+
}
1408+
else {
1409+
found_pos = true;
1410+
}
1411+
}
1412+
1413+
// identify the cigar operation
1414+
char cigar_code;
1415+
int length;
1416+
if (edit.from_length() == edit.to_length()) {
1417+
cigar_code = 'M';
1418+
length = edit.from_length();
1419+
}
1420+
else if (edit.from_length() > 0 && edit.to_length() == 0) {
1421+
cigar_code = 'D';
1422+
length = edit.from_length();
1423+
}
1424+
else if (edit.to_length() > 0 && edit.from_length() == 0) {
1425+
cigar_code = 'I';
1426+
length = edit.to_length();
1427+
}
1428+
else {
1429+
throw std::runtime_error("Spliced CIGAR construction can only convert simple edits");
1430+
}
1431+
1432+
append_cigar_operation(length, cigar_code, cigar);
1433+
} // close loop over edits
1434+
1435+
if (found_pos && i + 1 < path.mapping_size()) {
1436+
// we're anchored on the path by the annotated position, and we're transitioning between
1437+
// two mappings, so we should check for a deletion/splice edge
1438+
1439+
step_handle_t next_step = graph.get_next_step(step);
1440+
1441+
handle_t next_handle = graph.get_handle_of_step(next_step);
1442+
const Position& next_pos = path.mapping(rev ? path.mapping_size() - 2 - i : i + 1).position();
1443+
if (graph.get_id(next_handle) != next_pos.node_id()
1444+
|| (graph.get_is_reverse(next_handle) != next_pos.is_reverse()) != rev) {
1445+
1446+
// the next mapping in the alignment is not the next mapping on the path, so we must have
1447+
// taken a deletion
1448+
1449+
// find the closest step that is further along the path than the current one
1450+
// and matches the next position (usually there will only be one)
1451+
size_t curr_offset = graph.get_position_of_step(step);
1452+
size_t nearest_offset = numeric_limits<size_t>::max();
1453+
graph.for_each_step_on_handle(graph.get_handle(next_pos.node_id()),
1454+
[&](const step_handle_t& candidate) {
1455+
1456+
if (graph.get_path_handle_of_step(candidate) == path_handle) {
1457+
size_t candidate_offset = graph.get_position_of_step(candidate);
1458+
if (candidate_offset < nearest_offset && candidate_offset > curr_offset) {
1459+
nearest_offset = candidate_offset;
1460+
next_step = candidate;
1461+
}
1462+
}
1463+
});
1464+
1465+
if (nearest_offset == numeric_limits<size_t>::max()) {
1466+
throw std::runtime_error("Spliced BAM conversion could not find path steps that match alignment");
1467+
}
1468+
1469+
// the gap between the current step and the next one along the path
1470+
size_t deletion_length = (nearest_offset - curr_offset -
1471+
graph.get_length(graph.get_handle_of_step(step)));
1472+
1473+
// add to the cigar
1474+
if (deletion_length >= min_splice_length) {
1475+
// long enough to be a splice
1476+
append_cigar_operation(deletion_length, 'N', cigar);
1477+
}
1478+
else if (deletion_length) {
1479+
// create or extend a deletion
1480+
append_cigar_operation(deletion_length, 'D', cigar);
1481+
}
1482+
}
1483+
1484+
// iterate along the path
1485+
step = next_step;
1486+
}
1487+
} // close loop over mappings
1488+
1489+
if (cigar.back().second == 'I') {
1490+
// the final insertion is actually a softclip
1491+
cigar.back().second = 'S';
1492+
}
1493+
}
1494+
1495+
simplify_cigar(cigar);
1496+
1497+
return cigar;
1498+
}
1499+
13691500
void simplify_cigar(vector<pair<int, char>>& cigar) {
13701501

13711502
size_t removed = 0;
@@ -2256,6 +2387,53 @@ bool is_perfect(const Alignment& alignment) {
22562387
return true;
22572388
}
22582389

2390+
bool is_supplementary(const Alignment& alignment) {
2391+
if (!has_annotation(alignment, "supplementary")) {
2392+
return false;
2393+
}
2394+
else {
2395+
return get_annotation<bool>(alignment, "supplementary");
2396+
}
2397+
}
2398+
2399+
pair<int64_t, int64_t> aligned_interval(const Alignment& aln) {
2400+
return pair<int64_t, int64_t>(softclip_start(aln), aln.sequence().size() - softclip_end(aln));
2401+
}
2402+
2403+
string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) {
2404+
subrange_t subrange;
2405+
string path_name = Paths::strip_subrange(path, &subrange);
2406+
if (subrange != PathMetadata::NO_SUBRANGE) {
2407+
pos += subrange.first;
2408+
}
2409+
string info;
2410+
info.append(path_name);
2411+
info.push_back('\t');
2412+
info.append(to_string(pos));
2413+
info.push_back('\t');
2414+
info.push_back(rev_strand ? '1' : '0');
2415+
info.push_back(is_read1 ? '1' : '0');
2416+
return info;
2417+
}
2418+
2419+
tuple<string, int32_t, bool, bool> parse_mate_info(const string& info) {
2420+
tuple<string, int64_t, bool, bool> parsed;
2421+
size_t i = 0;
2422+
while (info[i] != '\t') {
2423+
++i;
2424+
}
2425+
get<0>(parsed) = info.substr(0, i);
2426+
++i;
2427+
size_t j = i;
2428+
while (info[j] != '\t') {
2429+
++j;
2430+
}
2431+
get<1>(parsed) = stoi(info.substr(i, j - i));
2432+
get<2>(parsed) = info[j + 1] == '1';
2433+
get<3>(parsed) = info[j + 2] == '1';
2434+
return parsed;
2435+
}
2436+
22592437
bool is_mapped(const Alignment& alignment) {
22602438
if (alignment.read_mapped()) {
22612439
// The flag for being mapped is set.

src/alignment.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,12 @@ int32_t determine_flag(const Alignment& alignment,
237237
/// suppress softclips up to that length. This will necessitate adjusting pos,
238238
/// which is why it is passed by reference.
239239
vector<pair<int, char>> cigar_against_path(const Alignment& alignment, bool on_reverse_strand, int64_t& pos, size_t path_len, size_t softclip_suppress);
240+
241+
/// Convert a spliced alignment against a path to a cigar. The alignment must be
242+
/// colinear along a path and contain only mappings on the path, but it can have
243+
/// deletions relative to the path that follow edges in the graph.
244+
vector<pair<int, char>> spliced_cigar_against_path(const Alignment& aln, const PathPositionHandleGraph& graph, const string& path_name,
245+
int64_t pos, bool rev, int64_t min_splice_length);
240246

241247
/// Merge runs of successive I/D operations into a single I and D, remove 0-length
242248
/// operations, and merge adjacent operations of the same type
@@ -312,6 +318,17 @@ pair<string, string> middle_signature(const Alignment& aln1, const Alignment& al
312318
/// Return whether the path is a perfect match (i.e. contains no non-match edits)
313319
/// and has no soft clips (e.g. like in vg stats -a)
314320
bool is_perfect(const Alignment& alignment);
321+
bool is_supplementary(const Alignment& alignment);
322+
// The indexes on the read sequence of the portion of the read that is aligned outside of soft clips
323+
pair<int64_t, int64_t> aligned_interval(const Alignment& aln);
324+
325+
// create an annotation string required to properly set the SAM fields/flags of a supplementary alignment
326+
// the arguments all refer to properties of the primary *mate* alignment
327+
// the path name saved in the info is the base path name, with any subrange info reflected in the position
328+
string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1);
329+
// parse the annotation string, returns tuple of (mate path name, mate path pos, mate rev strand, mate is read1)
330+
tuple<string, int32_t, bool, bool> parse_mate_info(const string& info);
331+
315332
/// Return whether the Alignment represents a mapped read (true) or an
316333
/// unaligned read (false). Uses the GAM read_mapped flag, but also sniffs for
317334
/// mapped reads which forgot to set it.

0 commit comments

Comments
 (0)