@@ -614,6 +614,8 @@ string alignment_to_sam_internal(const Alignment& alignment,
614614 bool paired,
615615 const int32_t tlen_max) {
616616
617+
618+
617619 // Determine flags, using orientation, next/prev fragments, and pairing status.
618620 int32_t flags = determine_flag (alignment, refseq, refpos, refrev, mateseq, matepos, materev, tlen, paired, tlen_max);
619621
@@ -691,8 +693,12 @@ int32_t determine_flag(const Alignment& alignment,
691693 // Determine flags, using orientation, next/prev fragments, and pairing status.
692694 int32_t flags = sam_flag (alignment, refrev, paired);
693695
694- // We've observed some reads with the unmapped flag set and also a CIGAR string set, which shouldn't happen.
695- // We will check for this. The CIGAR string will only be set in the output if the alignment has a path.
696+ // We've observed some reads with the unmapped flag set and also a CIGAR
697+ // string set, which is allowed by the SAM spec but which we are not
698+ // supposed to generate.
699+ //
700+ // We will check for this. The CIGAR string will only be set in the output
701+ // if the alignment has a path.
696702 assert ((bool )(flags & BAM_FUNMAP) != (alignment.has_path () && alignment.path ().mapping_size ()));
697703
698704 if (!((bool )(flags & BAM_FUNMAP)) && paired && !refseq.empty () && refseq == mateseq) {
@@ -725,6 +731,10 @@ int32_t determine_flag(const Alignment& alignment,
725731 flags |= BAM_FMREVERSE;
726732 }
727733
734+ if (is_supplementary (alignment)) {
735+ flags |= BAM_FSUPPLEMENTARY;
736+ }
737+
728738 return flags;
729739}
730740
@@ -805,7 +815,7 @@ bam1_t* alignment_to_bam_internal(bam_hdr_t* header,
805815 const bool refrev,
806816 const vector<pair<int , char >>& cigar,
807817 const string& mateseq,
808- const int32_t matepos,
818+ int32_t matepos,
809819 bool materev,
810820 const int32_t tlen,
811821 bool paired,
@@ -1362,6 +1372,131 @@ vector<pair<int, char>> cigar_against_path(const Alignment& alignment, bool on_r
13621372 return cigar;
13631373}
13641374
1375+ vector<pair<int , char >> spliced_cigar_against_path (const Alignment& aln, const PathPositionHandleGraph& graph, const string& path_name,
1376+ int64_t pos, bool rev, int64_t min_splice_length) {
1377+ // the return value
1378+ vector<pair<int , char >> cigar;
1379+
1380+ if (aln.has_path () && aln.path ().mapping_size () > 0 ) {
1381+ // the read is aligned to the path
1382+
1383+ path_handle_t path_handle = graph.get_path_handle (path_name);
1384+ step_handle_t step = graph.get_step_at_position (path_handle, pos);
1385+
1386+ // to indicate whether we've found the edit that corresponds to the BAM position
1387+ bool found_pos = false ;
1388+
1389+ const Path& path = aln.path ();
1390+ for (size_t i = 0 ; i < path.mapping_size (); ++i) {
1391+
1392+ // we traverse backwards on a reverse strand mapping
1393+ const Mapping& mapping = path.mapping (rev ? path.mapping_size () - 1 - i : i);
1394+
1395+ for (size_t j = 0 ; j < mapping.edit_size (); ++j) {
1396+
1397+ // we traverse backwards on a reverse strand mapping
1398+ const Edit& edit = mapping.edit (rev ? mapping.edit_size () - 1 - j : j);
1399+
1400+ if (!found_pos) {
1401+ // we may still be searching through an initial softclip to find
1402+ // the edit that corresponds to the BAM position
1403+ if (edit.to_length () > 0 && edit.from_length () == 0 ) {
1404+ append_cigar_operation (edit.to_length (), ' S' , cigar);
1405+ // skip the main block where we assign cigar operations
1406+ continue ;
1407+ }
1408+ else {
1409+ found_pos = true ;
1410+ }
1411+ }
1412+
1413+ // identify the cigar operation
1414+ char cigar_code;
1415+ int length;
1416+ if (edit.from_length () == edit.to_length ()) {
1417+ cigar_code = ' M' ;
1418+ length = edit.from_length ();
1419+ }
1420+ else if (edit.from_length () > 0 && edit.to_length () == 0 ) {
1421+ cigar_code = ' D' ;
1422+ length = edit.from_length ();
1423+ }
1424+ else if (edit.to_length () > 0 && edit.from_length () == 0 ) {
1425+ cigar_code = ' I' ;
1426+ length = edit.to_length ();
1427+ }
1428+ else {
1429+ throw std::runtime_error (" Spliced CIGAR construction can only convert simple edits" );
1430+ }
1431+
1432+ append_cigar_operation (length, cigar_code, cigar);
1433+ } // close loop over edits
1434+
1435+ if (found_pos && i + 1 < path.mapping_size ()) {
1436+ // we're anchored on the path by the annotated position, and we're transitioning between
1437+ // two mappings, so we should check for a deletion/splice edge
1438+
1439+ step_handle_t next_step = graph.get_next_step (step);
1440+
1441+ handle_t next_handle = graph.get_handle_of_step (next_step);
1442+ const Position& next_pos = path.mapping (rev ? path.mapping_size () - 2 - i : i + 1 ).position ();
1443+ if (graph.get_id (next_handle) != next_pos.node_id ()
1444+ || (graph.get_is_reverse (next_handle) != next_pos.is_reverse ()) != rev) {
1445+
1446+ // the next mapping in the alignment is not the next mapping on the path, so we must have
1447+ // taken a deletion
1448+
1449+ // find the closest step that is further along the path than the current one
1450+ // and matches the next position (usually there will only be one)
1451+ size_t curr_offset = graph.get_position_of_step (step);
1452+ size_t nearest_offset = numeric_limits<size_t >::max ();
1453+ graph.for_each_step_on_handle (graph.get_handle (next_pos.node_id ()),
1454+ [&](const step_handle_t & candidate) {
1455+
1456+ if (graph.get_path_handle_of_step (candidate) == path_handle) {
1457+ size_t candidate_offset = graph.get_position_of_step (candidate);
1458+ if (candidate_offset < nearest_offset && candidate_offset > curr_offset) {
1459+ nearest_offset = candidate_offset;
1460+ next_step = candidate;
1461+ }
1462+ }
1463+ });
1464+
1465+ if (nearest_offset == numeric_limits<size_t >::max ()) {
1466+ throw std::runtime_error (" Spliced BAM conversion could not find path steps that match alignment" );
1467+ }
1468+
1469+ // the gap between the current step and the next one along the path
1470+ size_t deletion_length = (nearest_offset - curr_offset -
1471+ graph.get_length (graph.get_handle_of_step (step)));
1472+
1473+ // add to the cigar
1474+ if (deletion_length >= min_splice_length) {
1475+ // long enough to be a splice
1476+ append_cigar_operation (deletion_length, ' N' , cigar);
1477+ }
1478+ else if (deletion_length) {
1479+ // create or extend a deletion
1480+ append_cigar_operation (deletion_length, ' D' , cigar);
1481+ }
1482+ }
1483+
1484+ // iterate along the path
1485+ step = next_step;
1486+ }
1487+ } // close loop over mappings
1488+
1489+ if (cigar.back ().second == ' I' ) {
1490+ // the final insertion is actually a softclip
1491+ cigar.back ().second = ' S' ;
1492+ }
1493+ }
1494+
1495+ simplify_cigar (cigar);
1496+
1497+ return cigar;
1498+ }
1499+
13651500void simplify_cigar (vector<pair<int , char >>& cigar) {
13661501
13671502 size_t removed = 0 ;
@@ -1664,7 +1799,7 @@ int32_t sam_flag(const Alignment& alignment, bool on_reverse_strand, bool paired
16641799 // TODO: catch reads with pair partners when they shouldn't be paired?
16651800 }
16661801
1667- if (!alignment. has_path () || alignment. path (). mapping_size () == 0 ) {
1802+ if (!is_mapped ( alignment) ) {
16681803 // unmapped
16691804 flag |= BAM_FUNMAP;
16701805 }
@@ -1823,18 +1958,28 @@ Alignment bam_to_alignment(const bam1_t *b,
18231958
18241959 }
18251960 alignment.set_read_paired ((b->core .flag & BAM_FPAIRED) != 0 );
1961+
1962+ // Consult the One True Place in SAM for knowing if a read is aligned or not.
1963+ bool is_aligned = !(b->core .flag & BAM_FUNMAP);
18261964
1827- if (graph != nullptr && bh != nullptr && b->core .tid >= 0 ) {
1965+ if (is_aligned) {
1966+ // Set the little-used GAM field that exists to represent this.
1967+ alignment.set_read_mapped (is_aligned);
1968+
1969+ // Use the MAPQ
18281970 alignment.set_mapping_quality (b->core .qual );
1829- alignment.set_read_mapped (true );
1830- // Look for the path handle this is against.
1831- auto found = tid_path_handle.find (b->core .tid );
1832- if (found == tid_path_handle.end ()) {
1833- cerr << " [vg::alignment.cpp] error: alignment references path not present in graph: "
1834- << bh->target_name [b->core .tid ] << endl;
1835- exit (1 );
1971+
1972+ if (graph != nullptr && bh != nullptr && b->core .tid >= 0 ) {
1973+ // We can actually build the path for this read.
1974+ // Look for the path handle this is against.
1975+ auto found = tid_path_handle.find (b->core .tid );
1976+ if (found == tid_path_handle.end ()) {
1977+ cerr << " [vg::alignment.cpp] error: alignment references path not present in graph: "
1978+ << bh->target_name [b->core .tid ] << endl;
1979+ exit (1 );
1980+ }
1981+ mapping_against_path (alignment, b, found->second , graph, b->core .flag & BAM_FREVERSE);
18361982 }
1837- mapping_against_path (alignment, b, found->second , graph, b->core .flag & BAM_FREVERSE);
18381983 }
18391984
18401985 // TODO: htslib doesn't wrap this flag for some reason.
@@ -1856,7 +2001,8 @@ Alignment bam_to_alignment(const bam1_t *b,
18562001 }
18572002 ++removed;
18582003 }
1859- else if (tag_name == " AS" ) {
2004+ else if (tag_name == " AS" && is_aligned) {
2005+ // Set the score, for aligned reads.
18602006 alignment.set_score (parse<int64_t >(tag.substr (5 , string::npos)));
18612007 ++removed;
18622008 }
@@ -2228,9 +2374,8 @@ int softclip_trim(Alignment& alignment) {
22282374}
22292375
22302376bool is_perfect (const Alignment& alignment) {
2231- // Non-aligned paths can't be perfect (code from subcommand/stats_main.cpp)
2232- bool has_alignment = alignment.score () > 0 || alignment.path ().mapping_size () > 0 ;
2233- if (!has_alignment) return false ;
2377+ // Non-aligned paths can't be perfect
2378+ if (!is_mapped (alignment)) return false ;
22342379
22352380 // Check that the path is perfect
22362381 for (size_t i = 0 ; i < alignment.path ().mapping_size (); ++i) {
@@ -2242,6 +2387,79 @@ bool is_perfect(const Alignment& alignment) {
22422387 return true ;
22432388}
22442389
2390+ bool is_supplementary (const Alignment& alignment) {
2391+ if (!has_annotation (alignment, " supplementary" )) {
2392+ return false ;
2393+ }
2394+ else {
2395+ return get_annotation<bool >(alignment, " supplementary" );
2396+ }
2397+ }
2398+
2399+ pair<int64_t , int64_t > aligned_interval (const Alignment& aln) {
2400+ return pair<int64_t , int64_t >(softclip_start (aln), aln.sequence ().size () - softclip_end (aln));
2401+ }
2402+
2403+ string mate_info (const string& path, int32_t pos, bool rev_strand, bool is_read1) {
2404+ subrange_t subrange;
2405+ string path_name = Paths::strip_subrange (path, &subrange);
2406+ if (subrange != PathMetadata::NO_SUBRANGE) {
2407+ pos += subrange.first ;
2408+ }
2409+ string info;
2410+ info.append (path_name);
2411+ info.push_back (' \t ' );
2412+ info.append (to_string (pos));
2413+ info.push_back (' \t ' );
2414+ info.push_back (rev_strand ? ' 1' : ' 0' );
2415+ info.push_back (is_read1 ? ' 1' : ' 0' );
2416+ return info;
2417+ }
2418+
2419+ tuple<string, int32_t , bool , bool > parse_mate_info (const string& info) {
2420+ tuple<string, int64_t , bool , bool > parsed;
2421+ size_t i = 0 ;
2422+ while (info[i] != ' \t ' ) {
2423+ ++i;
2424+ }
2425+ get<0 >(parsed) = info.substr (0 , i);
2426+ ++i;
2427+ size_t j = i;
2428+ while (info[j] != ' \t ' ) {
2429+ ++j;
2430+ }
2431+ get<1 >(parsed) = stoi (info.substr (i, j - i));
2432+ get<2 >(parsed) = info[j + 1 ] == ' 1' ;
2433+ get<3 >(parsed) = info[j + 2 ] == ' 1' ;
2434+ return parsed;
2435+ }
2436+
2437+ bool is_mapped (const Alignment& alignment) {
2438+ if (alignment.read_mapped ()) {
2439+ // The flag for being mapped is set.
2440+ return true ;
2441+ }
2442+
2443+ // Otherwise it's not set, but we don't consistently set it in vg. (SAM
2444+ // uses an *un*mapped flag so you don't forget to set it on mapped reads
2445+ // ever.) So second-guess the flag and up on round-tripping SAM with
2446+ // alignment fields set for unmapped reads.
2447+
2448+ if (alignment.path ().mapping_size () > 0 ) {
2449+ // If an alignment path is filled in, it is mapped.
2450+ return true ;
2451+ }
2452+
2453+ if (alignment.score () > 0 ) {
2454+ // If there's no alignment actually there but anonzero score, we
2455+ // represent a mapped read, we're just not sure where to.
2456+ return true ;
2457+ }
2458+
2459+ // If there's no evidence of being mapped, we're unmapped.
2460+ return false ;
2461+ }
2462+
22452463int query_overlap (const Alignment& aln1, const Alignment& aln2) {
22462464 if (!alignment_to_length (aln1) || !alignment_to_length (aln2)
22472465 || !aln1.path ().mapping_size () || !aln2.path ().mapping_size ()
0 commit comments