|
1 | 1 | #include "bam.hpp"
|
2 | 2 |
|
3 |
| -using namespace std ; |
| 3 | +using namespace std; |
4 | 4 |
|
5 |
| -uint32_t cigar_len_mask = 0xFFFFFFF0 ; |
6 |
| -uint32_t cigar_type_mask = 0xF ; |
| 5 | +uint32_t cigar_len_mask = 0xFFFFFFF0; |
| 6 | +uint32_t cigar_type_mask = 0xF; |
7 | 7 |
|
8 | 8 | string print_cigar_symbol(int type) {
|
9 |
| - if (type == BAM_CMATCH) { |
10 |
| - return "M" ; |
11 |
| - } |
12 |
| - if (type == BAM_CINS) { |
13 |
| - return "I" ; |
14 |
| - } |
15 |
| - if (type == BAM_CDEL) { |
16 |
| - return "D" ; |
17 |
| - } |
18 |
| - if (type == BAM_CSOFT_CLIP) { |
19 |
| - return "S" ; |
20 |
| - } |
21 |
| - if (type == BAM_CHARD_CLIP) { |
22 |
| - return "H" ; |
23 |
| - } |
24 |
| - return "X" ; |
| 9 | + if (type == BAM_CMATCH) { |
| 10 | + return "M"; |
| 11 | + } |
| 12 | + if (type == BAM_CINS) { |
| 13 | + return "I"; |
| 14 | + } |
| 15 | + if (type == BAM_CDEL) { |
| 16 | + return "D"; |
| 17 | + } |
| 18 | + if (type == BAM_CSOFT_CLIP) { |
| 19 | + return "S"; |
| 20 | + } |
| 21 | + if (type == BAM_CHARD_CLIP) { |
| 22 | + return "H"; |
| 23 | + } |
| 24 | + return "X"; |
25 | 25 | }
|
26 | 26 |
|
27 |
| -vector<pair<uint32_t, uint32_t>> decode_cigar(bam1_t* read) { |
28 |
| - // get CIGAR |
29 |
| - vector<pair<uint32_t, uint32_t>> cigar_offsets ; |
30 |
| - uint32_t* cigar = bam_get_cigar(read) ; |
31 |
| - int offset = 0 ; |
32 |
| - for (int i = 0; i < read->core.n_cigar; i++) { |
33 |
| - uint32_t type = cigar[i] & cigar_type_mask ; |
34 |
| - uint32_t length = cigar[i] >> 4 ; |
35 |
| - cigar_offsets.push_back(make_pair(length, type)) ; |
36 |
| - } |
37 |
| - return cigar_offsets ; |
| 27 | +vector<pair<uint32_t, uint32_t>> decode_cigar(bam1_t *read) { |
| 28 | + // get CIGAR |
| 29 | + vector<pair<uint32_t, uint32_t>> cigar_offsets; |
| 30 | + uint32_t *cigar = bam_get_cigar(read); |
| 31 | + int offset = 0; |
| 32 | + for (int i = 0; i < read->core.n_cigar; i++) { |
| 33 | + uint32_t type = cigar[i] & cigar_type_mask; |
| 34 | + uint32_t length = cigar[i] >> 4; |
| 35 | + cigar_offsets.push_back(make_pair(length, type)); |
| 36 | + } |
| 37 | + return cigar_offsets; |
38 | 38 | }
|
39 | 39 |
|
40 |
| -uint8_t* encode_cigar(vector<pair<uint32_t, uint32_t>> cigar) { |
41 |
| - uint32_t* cigar_bytes = (uint32_t*) malloc(sizeof(uint32_t) * cigar.size()) ; |
42 |
| - for (int i = 0; i < cigar.size(); i++) { |
43 |
| - cigar_bytes[i] = (cigar[i].first << 4) | (cigar[i].second & cigar_type_mask) ; |
44 |
| - } |
45 |
| - return (uint8_t*) cigar_bytes ; |
| 40 | +uint8_t *encode_cigar(vector<pair<uint32_t, uint32_t>> cigar) { |
| 41 | + uint32_t *cigar_bytes = (uint32_t *)malloc(sizeof(uint32_t) * cigar.size()); |
| 42 | + for (int i = 0; i < cigar.size(); i++) { |
| 43 | + cigar_bytes[i] = |
| 44 | + (cigar[i].first << 4) | (cigar[i].second & cigar_type_mask); |
| 45 | + } |
| 46 | + return (uint8_t *)cigar_bytes; |
46 | 47 | }
|
47 | 48 |
|
48 |
| -uint8_t* encode_bam_seq(char* seq) { |
49 |
| - int n = (strlen(seq) + 1) >> 1 ; |
50 |
| - int l_seq = strlen(seq) ; |
51 |
| - uint8_t* seq_bytes = (uint8_t*) malloc(sizeof(uint8_t) * n) ; |
52 |
| - int i = 0 ; |
53 |
| - n = 0 ; |
54 |
| - for (i = 0; i + 1 < l_seq; i += 2) { |
55 |
| - seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; |
56 |
| - n += 1 ; |
57 |
| - } |
58 |
| - for (; i < l_seq; i++) { |
59 |
| - seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; |
60 |
| - n += 1 ; |
61 |
| - } |
62 |
| - return seq_bytes ; |
| 49 | +uint8_t *encode_bam_seq(char *seq) { |
| 50 | + int n = (strlen(seq) + 1) >> 1; |
| 51 | + int l_seq = strlen(seq); |
| 52 | + uint8_t *seq_bytes = (uint8_t *)malloc(sizeof(uint8_t) * n); |
| 53 | + int i = 0; |
| 54 | + n = 0; |
| 55 | + for (i = 0; i + 1 < l_seq; i += 2) { |
| 56 | + seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | |
| 57 | + seq_nt16_table[(unsigned char)seq[i + 1]]; |
| 58 | + n += 1; |
| 59 | + } |
| 60 | + for (; i < l_seq; i++) { |
| 61 | + seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; |
| 62 | + n += 1; |
| 63 | + } |
| 64 | + return seq_bytes; |
63 | 65 | }
|
64 | 66 |
|
65 | 67 | char reverse_complement_base(char base) {
|
66 |
| - if (base == 'C' || base == 'c') { |
67 |
| - return 'G' ; |
68 |
| - } |
69 |
| - if (base == 'A' || base == 'a') { |
70 |
| - return 'T' ; |
71 |
| - } |
72 |
| - if (base == 'G' || base == 'g') { |
73 |
| - return 'C' ; |
74 |
| - } |
75 |
| - if (base == 'T' || base == 't') { |
76 |
| - return 'A' ; |
77 |
| - } |
78 |
| - else { |
79 |
| - return 'N' ; |
80 |
| - } |
| 68 | + if (base == 'C' || base == 'c') { |
| 69 | + return 'G'; |
| 70 | + } |
| 71 | + if (base == 'A' || base == 'a') { |
| 72 | + return 'T'; |
| 73 | + } |
| 74 | + if (base == 'G' || base == 'g') { |
| 75 | + return 'C'; |
| 76 | + } |
| 77 | + if (base == 'T' || base == 't') { |
| 78 | + return 'A'; |
| 79 | + } else { |
| 80 | + return 'N'; |
| 81 | + } |
81 | 82 | }
|
82 | 83 |
|
83 |
| -void reverse_complement_read(char* seq) { |
84 |
| - int l = strlen(seq) ; |
85 |
| - int i = 0 ; |
86 |
| - while (i < l / 2) { |
87 |
| - auto t = reverse_complement_base(seq[l - i]) ; |
88 |
| - seq[l - 1 - i] = reverse_complement_base(seq[i]) ; |
89 |
| - seq[i] = t ; |
90 |
| - i += 1 ; |
91 |
| - } |
| 84 | +void reverse_complement_read(char *seq) { |
| 85 | + int l = strlen(seq); |
| 86 | + int i = 0; |
| 87 | + while (i < l / 2) { |
| 88 | + auto t = reverse_complement_base(seq[l - i]); |
| 89 | + seq[l - 1 - i] = reverse_complement_base(seq[i]); |
| 90 | + seq[i] = t; |
| 91 | + i += 1; |
| 92 | + } |
92 | 93 | }
|
93 | 94 |
|
94 | 95 | vector<pair<int, int>> get_aligned_pairs(bam1_t *alignment) {
|
95 |
| - vector<pair<int, int>> result ; |
96 |
| - uint ref_pos = alignment->core.pos ; |
97 |
| - uint read_pos = 0 ; |
98 |
| - auto cigar_offsets = decode_cigar(alignment) ; |
99 |
| - int m = 0 ; |
100 |
| - while (true) { |
101 |
| - if (m == cigar_offsets.size()) { |
102 |
| - break ; |
103 |
| - } |
104 |
| - if (cigar_offsets[m].second == BAM_CMATCH or cigar_offsets[m].second == BAM_CEQUAL or cigar_offsets[m].second == BAM_CDIFF) { |
105 |
| - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
106 |
| - result.push_back(make_pair(read_pos, i)); |
107 |
| - read_pos++; |
108 |
| - } |
109 |
| - ref_pos += cigar_offsets[m].first; |
110 |
| - } else if (cigar_offsets[m].second == BAM_CINS or cigar_offsets[m].second == BAM_CSOFT_CLIP) { |
111 |
| - for (uint i = 0; i < cigar_offsets[m].first; ++i) { |
112 |
| - result.push_back(make_pair(read_pos, -1)); |
113 |
| - read_pos++; |
114 |
| - } |
115 |
| - } else if (cigar_offsets[m].second == BAM_CDEL) { |
116 |
| - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
117 |
| - result.push_back(make_pair(-1, i)); |
118 |
| - } |
119 |
| - ref_pos += cigar_offsets[m].first; |
120 |
| - } else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { |
121 |
| - // advances neither |
122 |
| - } else if (cigar_offsets[m].second == BAM_CREF_SKIP) { |
123 |
| - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
124 |
| - result.push_back(make_pair(-1, i)); |
125 |
| - } |
126 |
| - ref_pos += cigar_offsets[m].first; |
127 |
| - } else { //if (cigar_offsets[m].second == BAM_CPAD) { |
128 |
| - //TODO |
129 |
| - } |
130 |
| - m++ ; |
131 |
| - } |
132 |
| - return result; |
| 96 | + vector<pair<int, int>> result; |
| 97 | + uint ref_pos = alignment->core.pos; |
| 98 | + uint read_pos = 0; |
| 99 | + auto cigar_offsets = decode_cigar(alignment); |
| 100 | + int m = 0; |
| 101 | + while (true) { |
| 102 | + if (m == cigar_offsets.size()) { |
| 103 | + break; |
| 104 | + } |
| 105 | + if (cigar_offsets[m].second == BAM_CMATCH or |
| 106 | + cigar_offsets[m].second == BAM_CEQUAL or |
| 107 | + cigar_offsets[m].second == BAM_CDIFF) { |
| 108 | + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
| 109 | + result.push_back(make_pair(read_pos, i)); |
| 110 | + read_pos++; |
| 111 | + } |
| 112 | + ref_pos += cigar_offsets[m].first; |
| 113 | + } else if (cigar_offsets[m].second == BAM_CINS or |
| 114 | + cigar_offsets[m].second == BAM_CSOFT_CLIP) { |
| 115 | + for (uint i = 0; i < cigar_offsets[m].first; ++i) { |
| 116 | + result.push_back(make_pair(read_pos, -1)); |
| 117 | + read_pos++; |
| 118 | + } |
| 119 | + } else if (cigar_offsets[m].second == BAM_CDEL) { |
| 120 | + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
| 121 | + result.push_back(make_pair(-1, i)); |
| 122 | + } |
| 123 | + ref_pos += cigar_offsets[m].first; |
| 124 | + } else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { |
| 125 | + // advances neither |
| 126 | + } else if (cigar_offsets[m].second == BAM_CREF_SKIP) { |
| 127 | + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { |
| 128 | + result.push_back(make_pair(-1, i)); |
| 129 | + } |
| 130 | + ref_pos += cigar_offsets[m].first; |
| 131 | + } else { // if (cigar_offsets[m].second == BAM_CPAD) { |
| 132 | + // TODO |
| 133 | + } |
| 134 | + m++; |
| 135 | + } |
| 136 | + return result; |
133 | 137 | }
|
134 |
| - |
|
0 commit comments