9
9
#include < tuple>
10
10
using namespace std ::chrono;
11
11
12
- void parse_cigar (const bam1_t * read1, std::vector<uint32_t > &positions, std::vector<std::string> &bases, std::vector<uint32_t > &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos, std::vector<std::string> &overlap_seq, uint32_t o_lower, uint32_t o_upper, std::vector< uint32_t > &insertion_positions ){
12
+ void parse_cigar (const bam1_t * read1, std::vector<uint32_t > &positions, std::vector<std::string> &bases, std::vector<uint32_t > &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos){
13
13
// ref_start_pos describe the point after which we start recording bases
14
14
const uint8_t * seq_field1 = bam_get_seq (read1);
15
15
uint32_t *cigar1 = bam_get_cigar (read1);
@@ -23,46 +23,27 @@ void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vec
23
23
for (uint32_t j=total_query_pos; j < total_query_pos+len; j++){
24
24
char nuc = seq_nt16_str[bam_seqi (seq_field1, j)];
25
25
std::string tmp = std::string (1 , nuc);
26
- if (total_ref_pos+counter >= ref_start_pos){
27
- positions.push_back (total_ref_pos+counter);
28
- bases.push_back (tmp);
29
- qualities.push_back ((uint32_t )qual[j]);
30
- }
31
- if (total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
32
- // std::cerr << total_ref_pos+counter << " this " << std::endl;
33
- overlap_seq.push_back (tmp);
34
- }
26
+ positions.push_back (total_ref_pos+counter);
27
+ bases.push_back (tmp);
28
+ qualities.push_back ((uint32_t )qual[j]);
35
29
counter++;
36
30
}
37
31
}
38
32
if (op == 2 ){
39
- for (uint32_t j=0 ; j < len; j++){
40
- if (total_ref_pos+counter >= ref_start_pos){
41
- positions.push_back (total_ref_pos+counter);
42
- bases.push_back (" -" );
43
- qualities.push_back ((uint32_t )20 );
44
- }
45
- if (total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
46
- overlap_seq.push_back (" -" );
47
- // std::cerr << total_ref_pos+counter << " - " << std::endl;
48
- }
33
+ for (uint32_t j=0 ; j < len; j++){
34
+ positions.push_back (total_ref_pos+counter);
35
+ bases.push_back (" -" );
36
+ qualities.push_back ((uint32_t )20 );
49
37
counter++;
50
38
}
51
39
}
52
40
if (op == 1 ){
53
41
for (uint32_t j=total_query_pos; j < total_query_pos+len; j++){
54
42
char nuc = seq_nt16_str[bam_seqi (seq_field1, j)];
55
43
std::string tmp = " +" + std::string (1 , nuc);
56
- if (total_ref_pos+counter >= ref_start_pos){
57
- positions.push_back (total_ref_pos+counter);
58
- bases.push_back (tmp);
59
- qualities.push_back ((uint32_t )qual[j]);
60
- }
61
- if (total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
62
- overlap_seq.push_back (tmp);
63
- insertion_positions.push_back (total_query_pos+counter);
64
- // std::cerr << total_ref_pos << " " << nuc << std::endl;
65
- }
44
+ positions.push_back (total_ref_pos+counter);
45
+ bases.push_back (tmp);
46
+ qualities.push_back ((uint32_t )qual[j]);
66
47
counter++;
67
48
}
68
49
}
@@ -103,7 +84,7 @@ uint32_t find_sequence_end(const bam1_t* read){
103
84
return start;
104
85
}
105
86
106
- void merge_reads (const bam1_t * read1, const bam1_t * read2, IntervalTree amplicons){
87
+ void merge_reads (const bam1_t * read1, const bam1_t * read2, IntervalTree & amplicons){
107
88
// pass the forward first then reverse
108
89
// underlying assumption here is that the overlap region is identical
109
90
// also assumes that the forward read starts more "left" than the reverse
@@ -128,10 +109,6 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree amplicon
128
109
std::vector<std::string> bases2;
129
110
std::vector<uint32_t > qualities2;
130
111
131
- // CLEANUP can probably get rid of this
132
- std::vector<std::string> overlap_seq1;
133
- std::vector<std::string> overlap_seq2;
134
-
135
112
// we use all of the first read
136
113
uint32_t end_overlap, begin_overlap;
137
114
if (end_reverse <= end_forward){
@@ -141,57 +118,107 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree amplicon
141
118
}
142
119
begin_overlap = start_reverse;
143
120
144
- // std::cerr << start_forward << " " << end_forward << std::endl;
145
- // std::cerr << start_reverse << " " << end_reverse << std::endl;
146
- std::vector<uint32_t > insertion_positions1;
147
- std::vector<uint32_t > insertion_positions2;
148
-
149
- parse_cigar (read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward, overlap_seq1, begin_overlap, end_overlap, insertion_positions1);
150
- parse_cigar (read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward, overlap_seq2, begin_overlap, end_overlap, insertion_positions2);
151
-
152
- std::vector<uint32_t > removal;
121
+ parse_cigar (read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward);
122
+ parse_cigar (read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward);
153
123
154
124
// find all unique positions we need to cover
155
- std::unordered_set<int > unique_elements (positions1.begin (), positions1.end ());
125
+ std::unordered_set<uint32_t > unique_elements (positions1.begin (), positions1.end ());
156
126
unique_elements.insert (positions2.begin (), positions2.end ());
157
127
128
+ std::vector<uint32_t > final_positions;
129
+ std::vector<uint32_t > final_qualities;
130
+ std::vector<std::string> final_bases;
131
+
158
132
// for every position make sure the bases and qualities match
159
133
// make sure insertions match
160
134
for (auto pos : unique_elements){
161
- std::cerr << p << std::endl;
162
- }
163
- exit (0 );
164
- /* if(overlap_seq1.size() == overlap_seq2.size(){
165
- for(uint32_t j=0; j < overlap_seq1.size(); j++){
166
- if(overlap_seq1[j] != overlap_seq2[j]){
167
- removal.push_back(j);
135
+ auto first_it = std::find (positions1.begin (), positions1.end (), pos);
136
+ auto second_it = std::find (positions2.begin (), positions2.end (), pos);
137
+
138
+ // if its in the first read but not the second
139
+ if (first_it != positions1.end () && second_it == positions2.end ()){
140
+ auto it = positions1.begin ();
141
+ while ((it = std::find (it, positions1.end (), pos)) != positions1.end ()) {
142
+ uint32_t index1 = std::distance (positions1.begin (), it);
143
+ final_positions.push_back (positions1[index1]);
144
+ final_qualities.push_back (qualities1[index1]);
145
+ final_bases.push_back (bases1[index1]);
146
+ ++it;
168
147
}
169
- }
170
- } else {
171
- uint32_t max_range = std::max(overlap_seq1.size(), overlap_seq2.size());
172
- uint32_t counter1=0;
173
- uint32_t counter2=0;
174
- for(uint32_t j=0; j < max_range; j++){
175
- //check if this position is marked as an insertion in either sequence
176
- bool exists1 = std::count(insertion_positions1.begin(), insertion_positions1.end(), j) > 0;
177
- bool exists2 = std::count(insertion_positions2.begin(), insertion_positions2.end(), j) > 0;
178
- if(exists1 == exists2){
179
- if(overlap_seq1[counter1] != overlap_seq2[counter2]){
180
- removal.push_back();
148
+ } else if (first_it == positions1.end () && second_it != positions2.end ()){
149
+ // if its in the second read but not the first
150
+ auto it = positions2.begin ();
151
+ while ((it = std::find (it, positions2.end (), pos)) != positions2.end ()) {
152
+ uint32_t index2 = std::distance (positions2.begin (), it);
153
+ final_positions.push_back (positions2[index2]);
154
+ final_qualities.push_back (qualities2[index2]);
155
+ final_bases.push_back (bases2[index2]);
156
+ ++it;
157
+ }
158
+ } else if (first_it != positions1.end () && second_it != positions2.end ()){
159
+ // if its in both reads
160
+ // check if the number of times the position occurs is the same
161
+ uint32_t count1 = std::count (positions1.begin (), positions1.end (), pos);
162
+ uint32_t count2 = std::count (positions2.begin (), positions2.end (), pos);
163
+ if (count1 != count2) continue ;
164
+
165
+ std::vector<uint32_t > tmp_pos2;
166
+ std::vector<std::string> tmp_base2;
167
+ std::vector<uint32_t > tmp_qual2;
168
+ // go get all the second read info
169
+ auto sit = positions2.begin ();
170
+ while ((sit = std::find (sit, positions2.end (), pos)) != positions2.end ()) {
171
+ uint32_t index2 = std::distance (positions2.begin (), sit);
172
+ tmp_pos2.push_back (positions2[index2]);
173
+ tmp_qual2.push_back (qualities2[index2]);
174
+ tmp_base2.push_back (bases2[index2]);
175
+ ++sit;
176
+ }
177
+ std::vector<uint32_t > tmp_pos1;
178
+ std::vector<std::string> tmp_base1;
179
+ std::vector<uint32_t > tmp_qual1;
180
+ // go get all the first read info
181
+ auto fit = positions1.begin ();
182
+ while ((fit = std::find (fit, positions1.end (), pos)) != positions1.end ()) {
183
+ uint32_t index1 = std::distance (positions1.begin (), fit);
184
+ tmp_pos1.push_back (positions1[index1]);
185
+ tmp_qual1.push_back (qualities1[index1]);
186
+ tmp_base1.push_back (bases1[index1]);
187
+ ++fit;
188
+ }
189
+ // now do the base and quality comparison
190
+ bool use= true ;
191
+ for (uint32_t j=0 ; j < tmp_pos1.size (); j++){
192
+ if (tmp_base1[j] != tmp_base2[j]){
193
+ use = false ;
194
+ break ;
195
+ }
196
+ }
197
+ if (use) {
198
+ for (uint32_t j=0 ; j < tmp_pos1.size (); j++){
199
+ final_positions.push_back (tmp_pos1[j]);
200
+ final_qualities.push_back (tmp_qual1[j]);
201
+ final_bases.push_back (tmp_base1[j]);
181
202
}
182
203
}
183
204
}
184
- }*/
185
-
186
- // remove positions, bases, and qualitieis where it is not the same
187
- /* std::sort(removal.rbegin(), removal.rend());
188
- for (uint32_t idx : removal) {
189
- positions.erase(positions.begin() + idx);
190
- bases.erase(bases.begin() + idx);
191
- qualities.erase(qualities.begin() + idx);
192
- }*/
205
+ }
206
+
207
+ // TESTLINES
208
+ /*
209
+ for(uint32_t i=0; i < final_positions.size(); i++){
210
+ std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
211
+ }
212
+ exit(0);*/
213
+
214
+
193
215
// find assigned amplicon and populate position vector
194
- amplicons.find_read_amplicon (start_forward, end_reverse, positions1, bases1, qualities1);
216
+ bool found_amplicon = false ;
217
+ amplicons.find_read_amplicon (start_forward, end_reverse, final_positions, final_bases, final_qualities, found_amplicon);
218
+ if (!found_amplicon){
219
+ amplicons.add_read_variants (final_positions, final_bases, final_qualities);
220
+ std::cerr << " back " << found_amplicon << std::endl;
221
+ }
195
222
}
196
223
197
224
void generate_range (uint32_t start, uint32_t end, std::vector<uint32_t > &result) {
@@ -356,7 +383,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
356
383
while (sam_read1 (in, header, aln) >= 0 ) {
357
384
// get the name of the read
358
385
std::string read_name = bam_get_qname (aln);
359
- if (read_name != " A01535:8:HJ3YYDSX2:4:1103:6840:26052" ) continue ;
386
+ // if(read_name != "A01535:8:HJ3YYDSX2:4:1103:6840:26052") continue;
360
387
std::cerr << read_name << std::endl;
361
388
// std::cerr << read_name << std::endl;
362
389
strand = ' +' ;
@@ -367,7 +394,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
367
394
start_pos = aln->core .pos ;
368
395
}
369
396
// TEST LINES
370
- if (start_pos < 13000 || start_pos > 15000 ) continue ;
397
+ // if(start_pos < 13000 || start_pos > 15000) continue;
371
398
bam1_t *r = aln;
372
399
// get the md tag
373
400
uint8_t *aux = bam_aux_get (aln, " MD" );
0 commit comments