9
9
#include < tuple>
10
10
using namespace std ::chrono;
11
11
12
- void parse_cigar (const bam1_t * read1, std::vector<uint32_t > &positions, std::vector<std::string> &bases, std::vector<uint32_t > &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos ){
13
- // ref_start_pos describe the point after which we start recording bases
12
+ void parse_cigar (const bam1_t * read1, std::vector<uint32_t > &positions, std::vector<std::string> &bases, std::vector<uint32_t > &qualities, uint32_t total_ref_pos){
13
+ uint32_t total_query_pos= 0 ;
14
14
const uint8_t * seq_field1 = bam_get_seq (read1);
15
15
uint32_t *cigar1 = bam_get_cigar (read1);
16
16
uint8_t * qual = bam_get_qual (read1);
17
-
17
+ total_ref_pos += 1 ;
18
18
for (uint32_t i = 0 ; i < read1->core .n_cigar ; i++){
19
19
uint32_t op = bam_cigar_op (cigar1[i]);
20
20
uint32_t len = bam_cigar_oplen (cigar1[i]);
@@ -88,18 +88,14 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &lico
88
88
// pass the forward first then reverse
89
89
// underlying assumption here is that the overlap region is identical
90
90
// also assumes that the forward read starts more "left" than the reverse
91
-
91
+ // std::cerr << "merging reads"<< std::endl;
92
92
// get coordinates for potential overlap area
93
- uint32_t end_forward = find_sequence_end (read1);
93
+ // uint32_t end_forward = find_sequence_end(read1);
94
94
uint32_t start_reverse = read2->core .pos ;
95
95
96
96
uint32_t start_forward = read1->core .pos ;
97
97
uint32_t end_reverse = find_sequence_end (read2);
98
-
99
- // iterate the first cigar string
100
- uint32_t total_ref_pos = start_forward;
101
- uint32_t total_query_pos = 0 ;
102
-
98
+
103
99
// record the positions and their bases
104
100
std::vector<uint32_t > positions1;
105
101
std::vector<std::string> bases1;
@@ -108,18 +104,10 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &lico
108
104
std::vector<uint32_t > positions2;
109
105
std::vector<std::string> bases2;
110
106
std::vector<uint32_t > qualities2;
111
-
112
- // we use all of the first read
113
- uint32_t end_overlap, begin_overlap;
114
- if (end_reverse <= end_forward){
115
- end_overlap = end_reverse-1 ;
116
- } else {
117
- end_overlap = end_forward-1 ;
118
- }
119
- begin_overlap = start_reverse;
120
-
121
- parse_cigar (read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward);
122
- parse_cigar (read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward);
107
+
108
+ // start of read,
109
+ parse_cigar (read1, positions1, bases1, qualities1, start_forward);
110
+ parse_cigar (read2, positions2, bases2, qualities2, start_reverse);
123
111
124
112
// find all unique positions we need to cover
125
113
std::unordered_set<uint32_t > unique_elements (positions1.begin (), positions1.end ());
@@ -205,19 +193,27 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &lico
205
193
}
206
194
207
195
// TESTLINES
208
- /*
196
+ uint32_t test_counter = 0 ;
209
197
for (uint32_t i=0 ; i < final_positions.size (); i++){
210
- std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
211
- }
212
- exit(0);*/
213
-
198
+ if (final_positions[i] == 16176 ){
199
+ test_counter ++;
200
+ // std::cerr << bam_get_qname(read1) << std::endl;
201
+ // std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
202
+ }
203
+ }
204
+ // exit(0);
214
205
215
206
// find assigned amplicon and populate position vector
216
207
bool found_amplicon = false ;
217
- amplicons.find_read_amplicon (start_forward, end_reverse, final_positions, final_bases, final_qualities, found_amplicon);
208
+ // std::cerr << bam_get_qname(read1) << std::endl;
209
+ uint32_t amp_dist = 429496729 ;
210
+ uint32_t amp_start = 0 ;
211
+ amplicons.find_read_amplicon (start_forward, end_reverse, found_amplicon, bam_get_qname (read1), amp_start, amp_dist);
212
+ // std::cerr << found_amplicon << std::endl;
218
213
if (!found_amplicon){
219
214
amplicons.add_read_variants (final_positions, final_bases, final_qualities);
220
- std::cerr << " back " << found_amplicon << std::endl;
215
+ } else {
216
+ amplicons.assign_read_amplicon (amp_start, final_positions, final_bases, final_qualities);
221
217
}
222
218
}
223
219
@@ -383,8 +379,10 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
383
379
while (sam_read1 (in, header, aln) >= 0 ) {
384
380
// get the name of the read
385
381
std::string read_name = bam_get_qname (aln);
386
- // if(read_name != "A01535:8:HJ3YYDSX2:4:1103:6840:26052") continue;
387
- std::cerr << read_name << std::endl;
382
+ // TESTLINES
383
+ // if(read_name != "A01535:8:HJ3YYDSX2:4:1147:23972:15421") continue;
384
+ // if(read_name != "A01535:8:HJ3YYDSX2:4:1108:10212:30326") continue;
385
+
388
386
// std::cerr << read_name << std::endl;
389
387
strand = ' +' ;
390
388
if (bam_is_rev (aln)) {
@@ -394,7 +392,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
394
392
start_pos = aln->core .pos ;
395
393
}
396
394
// TEST LINES
397
- // if(start_pos < 13000 || start_pos > 15000 ) continue;
395
+ // if(start_pos < 15176 || start_pos > 17176 ) continue;
398
396
bam1_t *r = aln;
399
397
// get the md tag
400
398
uint8_t *aux = bam_aux_get (aln, " MD" );
@@ -415,17 +413,27 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
415
413
amplicons.add_read_variants (cigar, aln->core .pos , nlength, seq, aux, qualities, bam_get_qname (aln));
416
414
continue ;
417
415
}
418
- // TESTLINES
419
- if (aln->core .flag & BAM_FPAIRED){
420
- // std::cerr << bam_get_qname(aln) << std::endl;
421
- } else {
422
- // TODO HANDLE THIS CASE
423
- std::cerr << " read is unpaired" << std::endl;
424
- std::cerr << bam_get_qname (aln) << std::endl;
425
- exit (0 );
416
+ if (!aln->core .flag & BAM_FPAIRED){
417
+ std::cerr << " not paired" << std::endl;
418
+ // if the read is unpaired try to assign it to an amplicon anyways
419
+ std::vector<uint32_t > positions;
420
+ std::vector<std::string> bases;
421
+ std::vector<uint32_t > qualities;
422
+ uint32_t start_read = aln->core .pos ;
423
+ uint32_t end_read = find_sequence_end (aln);
424
+ parse_cigar (aln, positions, bases, qualities, start_read);
425
+ bool found_amplicon = false ;
426
+ uint32_t amp_dist = 429496729 ;
427
+ uint32_t amp_start = 0 ;
428
+ amplicons.find_read_amplicon (start_read, end_read, found_amplicon, read_name, amp_start, amp_dist);
429
+ if (!found_amplicon){
430
+ amplicons.add_read_variants (positions, bases, qualities);
431
+ } else {
432
+ amplicons.assign_read_amplicon (amp_start, positions, bases, qualities);
433
+ }
434
+ std::cerr << read_name << std::endl;
435
+ continue ;
426
436
}
427
-
428
-
429
437
auto it = read_map.find (read_name);
430
438
// assumption is that read pairs share a name
431
439
// execute if we've already seen the mate
@@ -444,7 +452,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
444
452
// Store the current read in the map
445
453
read_map[read_name] = bam_dup1 (aln); // Duplicate the read to avoid overwriting
446
454
}
447
- std::cerr << " end this" << std::endl;
448
455
continue ;
449
456
450
457
// TEST LINES
@@ -515,7 +522,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
515
522
}
516
523
517
524
}
518
- std::cerr << " this" << std::endl;
519
525
/*
520
526
std::cerr << "transforming primers" << std::endl;
521
527
//this is super time costly
@@ -537,10 +543,23 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
537
543
amplicons.write_out_frequencies (amp_file);
538
544
}
539
545
546
+ std::cerr << " test here " << amplicons.variants [16176 ].depth << std::endl;
547
+ std::cerr << " read map remaining " << read_map.size () << std::endl;
548
+ for (auto allele : amplicons.variants [16176 ].alleles ){
549
+ std::cerr << allele.nuc << " " << allele.depth << std::endl;
550
+ }
551
+
540
552
// combine amplicon counts to get total variants
541
553
amplicons.combine_haplotypes ();
542
554
// detect primer binding issues
543
555
std::vector<position> variants = amplicons.variants ;
556
+
557
+ std::cerr << " test here " << amplicons.variants [16176 ].depth << std::endl;
558
+ for (auto allele : amplicons.variants [16176 ].alleles ){
559
+ std::cerr << allele.nuc << " " << allele.depth << std::endl;
560
+ }
561
+ // exit(0);
562
+
544
563
545
564
// add in primer info
546
565
for (uint32_t i=0 ; i < variants.size (); i++){
@@ -567,7 +586,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
567
586
std::vector<uint32_t > flagged_positions;
568
587
std::vector<float > std_deviations;
569
588
std::vector<std::string> pos_nuc;
570
- uint32_t test_pos = 13572 ;
589
+ uint32_t test_pos = 0 ;
571
590
// detect fluctuating variants across amplicons
572
591
for (uint32_t i=0 ; i < amplicons.max_pos ; i++){
573
592
amplicons.test_flux .clear ();
0 commit comments