@@ -67,18 +67,20 @@ void IntervalTree::combine_haplotypes(ITNode *root){
67
67
combine_haplotypes (root->right );
68
68
}
69
69
70
- void IntervalTree::assign_read_amplicon (ITNode *root, uint32_t amp_start, std::vector<uint32_t > positions, std::vector<std::string> bases, std::vector<uint32_t > qualities){
70
+ void IntervalTree::assign_read_amplicon (ITNode *root, uint32_t amp_start, std::vector<uint32_t > positions, std::vector<std::string> bases, std::vector<uint32_t > qualities, uint8_t min_qual ){
71
71
if (root==NULL ) return ;
72
72
if ((uint32_t )root->data ->low == amp_start){
73
73
for (uint32_t i=0 ; i < positions.size (); i++){
74
74
for (uint32_t j=0 ; j < root->amp_positions .size (); j++){
75
75
if (positions[i] == root->amp_positions [j].pos ){
76
- root->amp_positions [j].update_alleles (bases[i], 1 , qualities[i]);
76
+ if (qualities[i] >= (uint32_t ) min_qual){
77
+ root->amp_positions [j].update_alleles (bases[i], 1 , qualities[i]);
78
+ }
77
79
}
78
80
}
79
81
}
80
82
}
81
- assign_read_amplicon (root->right , amp_start, positions, bases, qualities);
83
+ assign_read_amplicon (root->right , amp_start, positions, bases, qualities, min_qual );
82
84
}
83
85
84
86
void IntervalTree::find_read_amplicon (ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &_start, uint32_t &_dist){
@@ -99,7 +101,7 @@ void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upp
99
101
100
102
void IntervalTree::amplicon_position_pop (ITNode *root){
101
103
if (root==NULL ) return ;
102
- for (uint32_t i=root->data ->low ; i < root->data ->high ; i++){
104
+ for (uint32_t i=root->data ->low ; i < ( uint32_t ) root->data ->high ; i++){
103
105
position add_pos;
104
106
add_pos.pos = i;
105
107
add_pos.alleles = populate_basic_alleles ();
@@ -149,212 +151,11 @@ void IntervalTree::detect_abberations(ITNode *root, uint32_t find_position){
149
151
150
152
}
151
153
152
- void IntervalTree::add_read_variants (std::vector<uint32_t > positions, std::vector<std::string> bases, std::vector<uint32_t > qualities){
154
+ void IntervalTree::add_read_variants (std::vector<uint32_t > positions, std::vector<std::string> bases, std::vector<uint32_t > qualities, uint8_t min_qual ){
153
155
for (uint32_t i=0 ; i < positions.size (); i++){
154
- variants[positions[i]].update_alleles (bases[i], 1 , qualities[i]);
155
- }
156
- }
157
-
158
- void IntervalTree::add_read_variants (uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, uint8_t * qual, std::string qname) {
159
- uint32_t consumed_query = 0 ;
160
- uint32_t consumed_ref = 0 ;
161
- std::vector<uint32_t > sc_positions; // sc positions
162
- std::vector<uint32_t > ignore_sequence; // positions in query that are insertions
163
- // first we handle insertion from the cigar
164
- std::vector<uint32_t > useful;
165
- std::vector<uint8_t > sequence;
166
- std::vector<uint8_t > quality;
167
-
168
- uint32_t quality_threshold=20 ;
169
- uint8_t nt = 0 ;
170
- uint32_t length = 0 , ll=0 , op=0 ;
171
- // auto start = high_resolution_clock::now();
172
- for (uint32_t i=0 ; i < nlength; i++){
173
- op = bam_cigar_op (cigar[i]);
174
- if (bam_cigar_type (op) & 1 ){
175
- length = bam_cigar_oplen (cigar[i]);
176
- for (uint32_t k=0 ; k < length; k++){
177
- useful.push_back (ll+k);
178
- }
179
- ll += length;
180
- }
181
- }
182
- // TESTLINES
183
- uint32_t qual_threshold = 20 ;
184
- quality.reserve (useful.size ());
185
- sequence.reserve (useful.size ());
186
- for (uint32_t k=0 ; k < useful.size (); k++){
187
- if (qual[useful[k]] < qual_threshold){
188
- sequence.push_back (' L' );
189
- total_qual += qual[useful[k]];
190
- total_bases++;
191
- } else {
192
- nt = seq_nt16_str[bam_seqi (seq, useful[k])];
193
- sequence.push_back (nt);
194
- }
195
- quality.push_back (qual[useful[k]]);;
196
- }
197
- useful.clear ();
198
- std::string nuc;
199
- for (uint32_t j=0 ; j < nlength; j++){
200
- op = bam_cigar_op (cigar[j]);
201
- uint32_t oplen = bam_cigar_oplen (cigar[j]);
202
- if (op == 1 ){
203
- std::ostringstream convert;
204
- uint32_t q = 0 ;
205
- for (uint32_t k=0 ; k < oplen; k++) {
206
- // convert data type to get the characters
207
- ignore_sequence.push_back (k+consumed_ref+start_pos);
208
- convert << sequence[k+consumed_query];
209
- q += quality[k+consumed_query];
210
- }
211
- nuc.clear ();
212
- nuc = convert.str ();
213
-
214
- int avg_q = (int )q/nuc.size ();
215
- char ch = ' L' ;
216
- std::string nuc = " +" + convert.str ();
217
- // check if this position exists
218
- int exists = check_position_exists (start_pos+consumed_ref, variants);
219
- if (exists != -1 && nuc.find (ch) == std::string::npos) {
220
- variants[exists].update_alleles (nuc, 1 , avg_q);
221
- } else {
222
- // add position to vector
223
- position add_pos;
224
- add_pos.pos = start_pos+consumed_ref; // don't add a position
225
- if (nuc.find (ch) == std::string::npos) {
226
- add_pos.update_alleles (nuc, 1 , avg_q);
227
- variants.push_back (add_pos);
228
- }
229
- }
230
- consumed_query += oplen;
231
- continue ;
232
- }
233
- if (!(bam_cigar_type (op) & 1 ) && !(bam_cigar_type (op) & 2 )){
234
- continue ;
235
- }
236
- // if we don't consume both query and reference
237
- if (!(bam_cigar_type (op) & 1 ) || !(bam_cigar_type (op) & 2 )){
238
- if (op != 2 ){
239
- for (uint32_t k=0 ; k < oplen; k++) {
240
- // convert data type to get the characters
241
- sc_positions.push_back (k+consumed_query);
242
- }
243
- }
244
- }
245
- // consumes query
246
- if (bam_cigar_type (op) & 1 ){
247
- consumed_query += oplen;
248
- }
249
- // consumes ref
250
- if (bam_cigar_type (op) & 2 ){
251
- consumed_ref += oplen;
252
- }
253
- }
254
- // we will use the aux tag to handle deletions
255
- bool deletion = false ;
256
- bool substitution = false ;
257
- uint32_t current_pos = start_pos;
258
- std::vector<uint32_t > deletion_positions; // the aux tag does NOT recognize insertions
259
- std::vector<uint32_t > substitutions; // handle substitutions
260
- std::string gather_digits = " " ;
261
- std::string deleted_char;
262
- std::string sub_char;
263
- uint32_t last_char = 0 ;
264
- uint32_t j = 0 ;
265
- do {
266
- char character = (char )aux[j];
267
- if (character == ' ^' ){
268
- current_pos += std::stoi (gather_digits);
269
- gather_digits = " " ;
270
- deletion = true ;
271
- } else if (isdigit (character) && deletion) {
272
- deleted_char = " " ;
273
- deletion = false ;
274
- gather_digits += character;
275
- } else if (isalpha (character) && deletion) {
276
- variants[current_pos].update_alleles (" -" , 1 , 0 );
277
- deletion_positions.push_back (current_pos);
278
- current_pos += 1 ;
279
- deleted_char += character;
280
- deletion = true ;
281
- } else if (isdigit (character) && !deletion) {
282
- if (substitution){
283
- for (uint32_t z=1 ; z < sub_char.size (); z++){
284
- substitutions.push_back (current_pos + z);
285
- }
286
- substitution = false ;
287
- sub_char.clear ();
288
- }
289
- if (last_char > 0 ){
290
- current_pos += last_char;
291
- last_char = 0 ;
292
- }
293
- gather_digits += character;
294
- } else if (isalpha (character) && !deletion) {
295
- last_char += 1 ;
296
- substitution = true ;
297
- sub_char += character;
298
- if (gather_digits.size () > 0 ){
299
- current_pos += std::stoi (gather_digits);
300
- }
301
- gather_digits = " " ;
302
- }
303
- j++;
304
- } while (aux[j] != ' \0 ' );
305
- // now that we know where the insertions and deletions are, let's just iterate the query sequence and add it in, skipping problem positions
306
- current_pos = start_pos;
307
- bool prev_insertion = false ;
308
- std::ostringstream convert;
309
- nuc.clear ();
310
- std::vector<uint32_t > seen_insertions;
311
- std::vector<uint32_t >::iterator i_it;
312
- std::string test = " " ;
313
- // j is relative to the sequence and current pos to the reference
314
- for (uint32_t j=0 ; j < sequence.size (); j++){
315
- if (qname == test){
316
- std::cerr << " j " << j << " cur pos " << current_pos << " " << sequence[j] << std::endl;
317
- }
318
- std::vector<uint32_t >::iterator it = find (deletion_positions.begin (), deletion_positions.end (), current_pos);
319
- if (it != deletion_positions.end ()) {
320
- current_pos += 1 ;
321
- j -= 1 ;
322
- continue ;
323
- }
324
- it = find (ignore_sequence.begin (), ignore_sequence.end (), current_pos);
325
- i_it = find (seen_insertions.begin (), seen_insertions.end (), current_pos);
326
- // handle insertions
327
- if (it != ignore_sequence.end () && i_it == seen_insertions.end ()) {
328
- std::ostringstream convert;
329
- convert << sequence[j];
330
- nuc += convert.str ();
331
- seen_insertions.push_back (current_pos);
332
- current_pos += 1 ;
333
- prev_insertion = true ;
334
- continue ;
335
- } else if (it == ignore_sequence.end ()){
336
- // insertion is over
337
- if (prev_insertion) {
338
- current_pos -= nuc.size ();
339
- }
340
- prev_insertion = false ;
341
- nuc = " " ;
342
- }
343
- it = find (sc_positions.begin (), sc_positions.end (), j);
344
- if (it != sc_positions.end ()){
345
- continue ;
346
- }
347
- current_pos += 1 ;
348
- std::ostringstream convert;
349
- convert << sequence[j];
350
- std::string nuc = convert.str ();
351
- if ((uint32_t )quality[j] < quality_threshold){
352
- continue ;
156
+ if (qualities[i] >= (uint32_t )min_qual){
157
+ variants[positions[i]].update_alleles (bases[i], 1 , qualities[i]);
353
158
}
354
- char ch = ' L' ;
355
- if (nuc.find (ch) == std::string::npos) {
356
- variants[current_pos].update_alleles (nuc, 1 , quality[j]);
357
- }
358
159
}
359
160
}
360
161
@@ -379,44 +180,6 @@ int IntervalTree::unpaired_primers(ITNode *root, primer prim){
379
180
return ret;
380
181
}
381
182
382
- void IntervalTree::set_haplotypes (ITNode *root, primer prim){
383
- if (root==NULL ) return ;
384
- char strand = prim.get_strand ();
385
- // these are the bounds on the amplicon
386
- if (strand == ' +' && ((int )prim.get_start () != root->data ->low )){
387
- set_haplotypes (root->right , prim);
388
- } else if (strand == ' -' && ((int )prim.get_end ()+1 != root->data ->high )){
389
- set_haplotypes (root->right , prim);
390
- } else {
391
- // we found the matching amplion, now we add this cigarotype to the amplicon
392
- std::vector<position> tmp_pos = prim.get_positions ();
393
- for (uint32_t i=0 ; i < tmp_pos.size (); i++){
394
- position add_pos = tmp_pos[i];
395
- bool found = false ;
396
- // check if we already have a pos for this pos
397
- for (uint32_t j=0 ; j < root->amp_positions .size (); j++){
398
- position here_pos = root->amp_positions [j];
399
- if (here_pos.pos == tmp_pos[i].pos ){
400
- found = true ;
401
- root->amp_positions [j].depth += tmp_pos[i].depth ;
402
- std::vector<allele> new_alleles = add_allele_vectors (tmp_pos[i].alleles , root->amp_positions [j].alleles );
403
- root->amp_positions [j].alleles = new_alleles;
404
- break ;
405
- }
406
- }
407
- // if we've never seen this pos for this haplotype, push a new one
408
- if (!found){
409
- position tmp;
410
- tmp.depth = add_pos.depth ;
411
- tmp.alleles = add_pos.alleles ;
412
- tmp.pos = add_pos.pos ;
413
- root->amp_positions .push_back (tmp);
414
- }
415
- }
416
- return ;
417
- }
418
- }
419
-
420
183
// A utility function to insert a new Interval Search Tree Node
421
184
// This is similar to BST Insert. Here the low value of interval
422
185
// is used tomaintain BST property
0 commit comments