@@ -151,18 +151,19 @@ void generate_range(uint32_t start, uint32_t end, std::vector<uint32_t> &result)
151
151
}
152
152
}
153
153
154
// Computes the depth-weighted (population) standard deviation of a set of
// allele frequencies.
//
// frequencies: one frequency value per observation.
// depths:      the weight of each observation; depths[i] weights frequencies[i].
//
// Returns sqrt( sum(w_i * (f_i - mean)^2) / sum(w_i) ), where mean is the
// weighted mean sum(w_i * f_i) / sum(w_i).
//
// Returns 0 when there is nothing to measure: no paired observations, or a
// total weight of zero. Without this guard the divisions below would yield NaN.
// If the two vectors differ in length, only the indices present in both are
// used, so the function never reads past the end of the shorter one.
float calculate_standard_deviation(std::vector<float> frequencies, std::vector<uint32_t> depths) {
  const std::size_t count =
      frequencies.size() < depths.size() ? frequencies.size() : depths.size();

  // Accumulate in double: converting a large uint32_t depth to float first
  // would drop low-order bits before the value ever reached the accumulator.
  double weighted_sum = 0.0;
  double total_weight = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    const double weight = static_cast<double>(depths[i]);
    weighted_sum += static_cast<double>(frequencies[i]) * weight;
    total_weight += weight;
  }

  if (total_weight <= 0.0) {
    return 0.0f;
  }

  const double mean = weighted_sum / total_weight;

  double weighted_variance = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    const double deviation = static_cast<double>(frequencies[i]) - mean;
    weighted_variance += static_cast<double>(depths[i]) * deviation * deviation;
  }
  weighted_variance /= total_weight;

  return static_cast<float>(std::sqrt(weighted_variance));
}
167
168
168
169
// first main function call
@@ -276,8 +277,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
276
277
} else {
277
278
start_pos = aln->core .pos ;
278
279
}
279
- // TESTLINES
280
- if (start_pos > 450 ) continue ;
281
280
bam1_t *r = aln;
282
281
// get the md tag
283
282
uint8_t *aux = bam_aux_get (aln, " MD" );
@@ -328,7 +327,10 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
328
327
read_map[read_name] = bam_dup1(aln); // Duplicate the read to avoid overwriting
329
328
}*/
330
329
331
- // if(start_pos > 22664 || start_pos < 24664) continue;
330
+ // TEST LINES
331
+ // if(start_pos > 3500) continue;
332
+ // std::string test = bam_get_qname(aln);
333
+ // if(test != "A01535:8:HJ3YYDSX2:4:1126:24433:27305") continue;
332
334
counter += 1 ;
333
335
overlapping_primers.clear ();
334
336
if (strand == ' +' ){
@@ -404,7 +406,15 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
404
406
for (uint32_t i=0 ; i < primers.size (); i++){
405
407
amplicons.set_haplotypes (primers[i]);
406
408
}
407
-
409
+ /*
410
+ //TEST LINES
411
+ std::string amp_file = "/home/chrissy/paper_ivar_2.0/real_primer_binding/var/file_121_all.txt";
412
+ std::ofstream file_amp(amp_file, std::ios::trunc);
413
+ file_amp << "POS\tALLELE\tDEPTH\tFREQ\tAMP_START\tAMP_END\n";
414
+ file_amp.close();
415
+ amplicons.write_out_frequencies(amp_file);
416
+ */
417
+
408
418
// combine amplicon counts to get total variants
409
419
amplicons.combine_haplotypes ();
410
420
// detect primer binding issues
@@ -444,32 +454,34 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
444
454
if (amplicons.test_flux .size () < 2 ) continue ;
445
455
446
456
std::map<std::string, std::vector<float >> allele_maps;
457
+ std::map<std::string, std::vector<uint32_t >> allele_depths;
447
458
for (uint32_t j=0 ; j < amplicons.test_flux .size (); j++){
448
459
if (i == test_pos){
449
460
std::cerr << " \n " << amplicons.test_test [j] << std::endl;
450
461
}
451
462
uint32_t total_depth = amplicons.test_flux [j].depth ;
452
463
// actually, we use ungapped depth
453
464
for (auto pos : amplicons.test_flux [j].alleles ){
454
- if (pos.nuc == " -" ){
455
- total_depth -= pos.depth ;
456
- }
465
+ // if(pos.nuc == "-"){
466
+ // total_depth -= pos.depth;
467
+ // }
457
468
if (i == test_pos){
458
469
std::cerr << pos.nuc << " " << pos.depth << std::endl;
459
470
}
460
471
}
461
472
462
- if (i == test_pos){
463
- std::cerr << " total depth " << total_depth << std::endl;
464
- }
465
473
if (total_depth < 50 ){
466
474
continue ;
467
475
}
476
+ if (i == test_pos){
477
+ std::cerr << " total depth " << total_depth << std::endl;
478
+ }
468
479
std::vector<allele> ad = amplicons.test_flux [j].alleles ;
469
480
for (uint32_t k=0 ; k < ad.size (); k++){
470
481
std::string nuc = ad[k].nuc ;
471
- if (nuc == " -" ) continue ;
482
+ // if(nuc == "-") continue;
472
483
uint32_t ad_depth = ad[k].depth ;
484
+ if (ad_depth == 0 ) continue ;
473
485
if (i == test_pos){
474
486
std::cerr << nuc << " " << ad_depth << " " << total_depth << std::endl;
475
487
}
@@ -478,22 +490,34 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
478
490
if (i == test_pos){
479
491
std::cerr << " allele map " << nuc << " " << t << std::endl;
480
492
}
481
- allele_maps[nuc].push_back (t);
493
+ allele_maps[nuc].push_back (t);
494
+ allele_depths[nuc].push_back (ad_depth);
482
495
} else {
483
496
std::vector<float > tmp;
497
+ std::vector<uint32_t > tmp_depths;
484
498
tmp.push_back (t);
499
+ tmp_depths.push_back (ad_depth);
485
500
if (i == test_pos){
486
501
std::cerr << " allele map " << nuc << " " << t << std::endl;
487
502
}
488
503
allele_maps[nuc] = tmp;
504
+ allele_depths[nuc] = tmp_depths;
489
505
}
490
506
}
491
507
}
492
508
std::map<std::string, std::vector<float >>::iterator it;
493
509
for (it = allele_maps.begin (); it != allele_maps.end (); it++){
494
- float sd = calculate_standard_deviation (it->second );
510
+ float sd = ( float ) calculate_standard_deviation (it->second , allele_depths[it-> first ] );
495
511
if (i == test_pos){
512
+ for (auto bit = allele_depths.begin (); bit != allele_depths.end (); ++bit) {
513
+ std::cerr << " allele depths " << bit->first << std::endl;
514
+ for (auto b : bit->second ){
515
+ std::cerr << b << " " ;
516
+ }
517
+ std::cerr << " \n " ;
518
+ }
496
519
std::cerr << i << " std " << sd << " " << it->first << std::endl;
520
+ std::cerr << " allele frequencies" << std::endl;
497
521
for (auto x : it->second ){
498
522
std::cerr << x << std::endl;
499
523
}
@@ -536,7 +560,8 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
536
560
// write variants to a file
537
561
ofstream file;
538
562
file.open (bam_out + " .txt" , ios::trunc);
539
- file << " POS\t ALLELE\t DEPTH\t FREQ\t GAPPED_FREQ\t AVG_QUAL\t FLAGGED_POS\t AMP_MASKED\t PRIMER_BINDING\n " ;
563
+ // file << "POS\tALLELE\tDEPTH\tFREQ\tGAPPED_FREQ\tAVG_QUAL\tFLAGGED_POS\tAMP_MASKED\tPRIMER_BINDING\n";
564
+ file << " REGION\t POS\t REF\t ALT\t REF_DP\t REF_RV\t REF_QUAL\t ALT_DP\t ALT_RV\t ALT_QUAL\t ALT_FREQ\t TOTAL_DP\t PVAL\t PASS\t GFF_FEATURE\t REF_CODON\t REF_AA\t ALT_CODON\t ALT_AA\t POS_AA\t GAPPED_FREQ\t FLAGGED_POS\t AMP_MASKED\t PRIMER_BINDING\n " ;
540
565
for (uint32_t i=0 ; i < variants.size (); i++){
541
566
// find the depth of the deletion to calculate upgapped depth
542
567
float del_depth = 0 ;
@@ -552,13 +577,28 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
552
577
if ((float )variants[i].alleles [j].depth == 0 ){
553
578
continue ;
554
579
}
580
+ file << " NA\t " ; // region
555
581
file << std::to_string (variants[i].pos ) << " \t " ;
582
+ file << " NA\t " ; // ref
556
583
std::string test_string = variants[i].alleles [j].nuc ;
557
584
file << variants[i].alleles [j].nuc << " \t " ;
558
- file << std::to_string (variants[i].alleles [j].depth ) << " \t " ;
559
- file << std::to_string (freq) << " \t " ;
585
+ file << " NA\t " ; // ref dp
586
+ file << " NA\t " ; // ref rv
587
+ file << " NA\t " ; // ref qual
588
+ file << std::to_string (variants[i].alleles [j].depth ) << " \t " ; // alt dp
589
+ file << " NA\t " ; // alt rv
590
+ file << std::to_string ((float )variants[i].alleles [j].mean_qual / (float )variants[i].alleles [j].depth ) << " \t " ; // alt qual
591
+ file << std::to_string (freq) << " \t " ; // alt freq
592
+ file << std::to_string (variants[i].depth ) << " \t " ; // total dp
593
+ file << " NA\t " ; // pval
594
+ file << " NA\t " ; // pass
595
+ file << " NA\t " ; // gff feature
596
+ file << " NA\t " ; // ref codon
597
+ file << " NA\t " ; // ref aa
598
+ file << " NA\t " ; // alt codon
599
+ file << " NA\t " ; // alt aa
600
+ file << " NA\t " ; // pos aa
560
601
file << std::to_string (gapped_freq) << " \t " ;
561
- file << std::to_string ((float )variants[i].alleles [j].mean_qual / (float )variants[i].alleles [j].depth ) << " \t " ;
562
602
std::vector<uint32_t >::iterator it;
563
603
it = find (flagged_positions.begin (), flagged_positions.end (), variants[i].pos );
564
604
if (it != flagged_positions.end ()){
0 commit comments