Skip to content

Commit 0678057

Browse files
committed
variant output styles match 1.0 and adding depth weighted stddev
1 parent c968352 commit 0678057

File tree

2 files changed

+87
-57
lines changed

2 files changed

+87
-57
lines changed

src/gmm.cpp

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ void calculate_gapped_frequency(std::vector<variant> &variants, double universal
9494
}
9595
}
9696
variants[i].gapped_freq = variants[i].depth / total_depth;
97-
variants[i].transformed_gap_freq = log(variants[i].gapped_freq / (1 - variants[i].gapped_freq));
9897
if(variants[i].gapped_freq > universal_cluster || variants[i].gapped_freq < noise_cluster){
9998
variants[i].gap_outside_freq_range = true;
10099
}
@@ -236,7 +235,7 @@ gaussian_mixture_model retrain_model(uint32_t n, arma::mat data, std::vector<var
236235

237236
for(uint32_t c=0; c < centroid_vec.size(); c++){
238237
//std::cerr << 0.5 + (1.0 / 20) * std::log(centroid_vec[c] / (1.0 - centroid_vec[c])) << std::endl;
239-
std::cerr << "initial means " << centroid_vec[c] << std::endl;
238+
//std::cerr << "initial means " << centroid_vec[c] << std::endl;
240239
initial_means.col(i) = (double)centroid_vec[c];
241240
//initial_means.col(i) = 0.5 + (1.0 / 1) * std::log(centroid_vec[c] / (1.0 - centroid_vec[c]));
242241
cov.col(i) = 0.005;
@@ -819,14 +818,14 @@ std::vector<uint32_t> find_deletion_positions(std::string filename, uint32_t dep
819818
continue;
820819
}
821820
std::vector<std::string> row_values;
822-
//split the line by delimiter
823821
split(line, '\t', row_values);
824-
depth = std::stoi(row_values[2]);
825-
pos = std::stoi(row_values[0]);
826-
freq = std::stof(row_values[3]);
822+
823+
depth = std::stoi(row_values[7]);
824+
pos = std::stoi(row_values[1]);
825+
freq = std::stof(row_values[10]);
827826
float multiplier = pow(10, round_val);
828827
freq = round(freq * multiplier) / multiplier;
829-
std::string nuc = row_values[1];
828+
std::string nuc = row_values[3];
830829
count++;
831830
if(1/freq * depth < depth_cutoff || freq < lower_bound) continue;
832831
if (is_substring(nuc, "+") || is_substring(nuc, "-")) {
@@ -859,28 +858,26 @@ void parse_internal_variants(std::string filename, std::vector<variant> &variant
859858
std::string amp_flag = "";
860859
std::string primer_flag = "";
861860
split(line, '\t', row_values);
862-
pos = std::stoi(row_values[0]);
861+
pos = std::stoi(row_values[1]);
863862
bool del_pos = std::find(deletion_positions.begin(), deletion_positions.end(), pos) != deletion_positions.end();
864863

865-
depth = std::stoi(row_values[2]);
866-
freq = std::stof(row_values[3]);
867-
864+
depth = std::stoi(row_values[7]);
865+
freq = std::stof(row_values[10]);
866+
868867
float multiplier = pow(10, round_val);
869868
freq = round(freq * multiplier) / multiplier;
870869

871-
gapped_freq = std::stof(row_values[4]);
872-
qual = std::stof(row_values[5]);
873-
flag = row_values[6];
874-
amp_flag = row_values[7];
875-
primer_flag = row_values[8];
870+
gapped_freq = std::stof(row_values[20]);
871+
qual = std::stof(row_values[9]);
872+
flag = row_values[21];
873+
amp_flag = row_values[22];
874+
primer_flag = row_values[23];
876875

877876
variant tmp;
878877
tmp.position = pos;
879-
tmp.nuc = row_values[1];
878+
tmp.nuc = row_values[3];
880879
tmp.depth = depth;
881880
tmp.freq = freq;
882-
//tmp.transformed_freq = log(freq / (1 - freq));;
883-
tmp.transformed_freq = 1.0 / (1.0 + std::exp(-1 * (freq - 0.5)));
884881
tmp.qual = qual;
885882

886883
if (flag == "TRUE"){
@@ -931,7 +928,7 @@ void parse_internal_variants(std::string filename, std::vector<variant> &variant
931928
}
932929

933930
std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
934-
uint32_t n=6;
931+
uint32_t n=8;
935932
//changes based on error rate
936933
float lower_bound = 0.01;
937934
float upper_bound = 0.99;
@@ -1015,7 +1012,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
10151012
retrained.means.clear();
10161013
retrained.hefts.clear();
10171014
retrained.prob_matrix.clear();
1018-
retrained = retrain_model(counter, data, variants, lower_n, 0.001);
1015+
retrained = retrain_model(counter, data, variants, lower_n, 0.0001);
10191016
bool optimal = true;
10201017
assign_clusters(variants, retrained, lower_n);
10211018
std::vector<std::vector<double>> clusters(counter);
@@ -1036,7 +1033,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
10361033
for(auto d : data){
10371034
std::cerr << d << " ";
10381035
//this is for 2 standard dev
1039-
if(std::abs(d-mean) > 0.05){
1036+
if(std::abs(d-mean) > 0.10){
10401037
count_far++;
10411038
}
10421039
}
@@ -1105,11 +1102,6 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11051102
std::vector<double> final_means;
11061103
for(auto data : clusters){
11071104
double mean = calculate_mean(data);
1108-
/*for(auto d : data){
1109-
std::cerr << d << " ";
1110-
}
1111-
std::cerr <<"\n";
1112-
std::cerr << "end mean " << mean << std::endl;*/
11131105
final_means.push_back(mean);
11141106
}
11151107
std::ofstream file;
@@ -1125,7 +1117,6 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11251117
mads_string += tmp_str;
11261118
}
11271119

1128-
std::cerr << mads_string << std::endl;
11291120
std::string mad_output = output_prefix + "_mad.txt";
11301121
file.open(mad_output, std::ios::trunc);
11311122
file << "MADS\n";
@@ -1142,7 +1133,6 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11421133
tmp_str += "]\n";
11431134
percent_string += tmp_str;
11441135
}
1145-
std::cerr << percent_string << std::endl;
11461136
std::string percent_output = output_prefix + "_percent.txt";
11471137
file.open(percent_output, std::ios::trunc);
11481138
file << "PERCENTS\n";
@@ -1184,9 +1174,9 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11841174
variants.push_back(base_variants[i]);
11851175
}
11861176
}
1177+
count=0;
11871178
//populate a new armadillo dataset with more frequencies
11881179
arma::mat final_data(1, useful_var, arma::fill::zeros);
1189-
count=0;
11901180
for(uint32_t i = 0; i < variants.size(); i++){
11911181
double tmp;
11921182
//check if variant should be filtered for first pass model

src/saga.cpp

Lines changed: 67 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -151,18 +151,19 @@ void generate_range(uint32_t start, uint32_t end, std::vector<uint32_t> &result)
151151
}
152152
}
153153

154-
float calculate_standard_deviation(std::vector<float> frequencies) {
155-
float sum = 0.0, mean = 0.0, sd = 0.0;
156-
uint32_t i = 0;
157-
for(i = 0; i < frequencies.size(); ++i) {
158-
sum += frequencies[i];
154+
float calculate_standard_deviation(std::vector<float> frequencies, std::vector<uint32_t> depths) {
155+
double weighted_sum = 0.0, total_weight = 0.0;
156+
for(uint32_t i = 0; i < frequencies.size(); i++) {
157+
weighted_sum += frequencies[i] * depths[i];
158+
total_weight += depths[i];
159159
}
160-
mean = sum / frequencies.size();
161-
162-
for(i = 0; i < frequencies.size(); ++i) {
163-
sd += pow(frequencies[i] - mean, 2);
160+
double mean = weighted_sum / total_weight;
161+
double weighted_variance = 0.0;
162+
for (uint32_t i = 0; i < frequencies.size(); i++) {
163+
weighted_variance += depths[i] * std::pow(frequencies[i] - mean, 2);
164164
}
165-
return sqrt(sd / frequencies.size());
165+
weighted_variance /= total_weight;
166+
return std::sqrt(weighted_variance);
166167
}
167168

168169
//first main function call
@@ -276,8 +277,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
276277
} else {
277278
start_pos = aln->core.pos;
278279
}
279-
//TESTLINES
280-
if(start_pos > 450) continue;
281280
bam1_t *r = aln;
282281
//get the md tag
283282
uint8_t *aux = bam_aux_get(aln, "MD");
@@ -328,7 +327,10 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
328327
read_map[read_name] = bam_dup1(aln); // Duplicate the read to avoid overwriting
329328
}*/
330329

331-
//if(start_pos > 22664 || start_pos < 24664) continue;
330+
//TEST LINES
331+
//if(start_pos > 3500) continue;
332+
//std::string test = bam_get_qname(aln);
333+
//if(test != "A01535:8:HJ3YYDSX2:4:1126:24433:27305") continue;
332334
counter += 1;
333335
overlapping_primers.clear();
334336
if(strand == '+'){
@@ -404,7 +406,15 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
404406
for (uint32_t i=0; i < primers.size(); i++){
405407
amplicons.set_haplotypes(primers[i]);
406408
}
407-
409+
/*
410+
//TEST LINES
411+
std::string amp_file = "/home/chrissy/paper_ivar_2.0/real_primer_binding/var/file_121_all.txt";
412+
std::ofstream file_amp(amp_file, std::ios::trunc);
413+
file_amp << "POS\tALLELE\tDEPTH\tFREQ\tAMP_START\tAMP_END\n";
414+
file_amp.close();
415+
amplicons.write_out_frequencies(amp_file);
416+
*/
417+
408418
//combine amplicon counts to get total variants
409419
amplicons.combine_haplotypes();
410420
//detect primer binding issues
@@ -444,32 +454,34 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
444454
if (amplicons.test_flux.size() < 2) continue;
445455

446456
std::map<std::string, std::vector<float>> allele_maps;
457+
std::map<std::string, std::vector<uint32_t>> allele_depths;
447458
for(uint32_t j=0; j < amplicons.test_flux.size(); j++){
448459
if(i == test_pos){
449460
std::cerr << "\n" << amplicons.test_test[j] << std::endl;
450461
}
451462
uint32_t total_depth = amplicons.test_flux[j].depth;
452463
//actually, we use ungapped depth
453464
for(auto pos : amplicons.test_flux[j].alleles){
454-
if(pos.nuc == "-"){
455-
total_depth -= pos.depth;
456-
}
465+
//if(pos.nuc == "-"){
466+
// total_depth -= pos.depth;
467+
//}
457468
if(i == test_pos){
458469
std::cerr << pos.nuc << " " << pos.depth << std::endl;
459470
}
460471
}
461472

462-
if(i == test_pos){
463-
std::cerr << "total depth " << total_depth << std::endl;
464-
}
465473
if(total_depth < 50){
466474
continue;
467475
}
476+
if(i == test_pos){
477+
std::cerr << "total depth " << total_depth << std::endl;
478+
}
468479
std::vector<allele> ad = amplicons.test_flux[j].alleles;
469480
for(uint32_t k=0; k < ad.size(); k++){
470481
std::string nuc = ad[k].nuc;
471-
if(nuc == "-") continue;
482+
//if(nuc == "-") continue;
472483
uint32_t ad_depth = ad[k].depth;
484+
if(ad_depth == 0) continue;
473485
if(i == test_pos){
474486
std::cerr << nuc << " " << ad_depth << " " << total_depth << std::endl;
475487
}
@@ -478,22 +490,34 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
478490
if(i == test_pos){
479491
std::cerr << "allele map " << nuc << " " << t << std::endl;
480492
}
481-
allele_maps[nuc].push_back(t);
493+
allele_maps[nuc].push_back(t);
494+
allele_depths[nuc].push_back(ad_depth);
482495
} else {
483496
std::vector<float> tmp;
497+
std::vector<uint32_t> tmp_depths;
484498
tmp.push_back(t);
499+
tmp_depths.push_back(ad_depth);
485500
if(i == test_pos){
486501
std::cerr << "allele map " << nuc << " " << t << std::endl;
487502
}
488503
allele_maps[nuc] = tmp;
504+
allele_depths[nuc] = tmp_depths;
489505
}
490506
}
491507
}
492508
std::map<std::string, std::vector<float>>::iterator it;
493509
for (it = allele_maps.begin(); it != allele_maps.end(); it++){
494-
float sd = calculate_standard_deviation(it->second);
510+
float sd = (float)calculate_standard_deviation(it->second, allele_depths[it->first]);
495511
if(i == test_pos){
512+
for (auto bit = allele_depths.begin(); bit != allele_depths.end(); ++bit) {
513+
std::cerr << "allele depths " << bit->first << std::endl;
514+
for(auto b : bit->second){
515+
std::cerr << b << " ";
516+
}
517+
std::cerr << "\n";
518+
}
496519
std::cerr << i << " std " << sd << " " << it->first << std::endl;
520+
std::cerr << "allele frequencies" << std::endl;
497521
for(auto x : it->second){
498522
std::cerr << x << std::endl;
499523
}
@@ -536,7 +560,8 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
536560
//write variants to a file
537561
ofstream file;
538562
file.open(bam_out + ".txt", ios::trunc);
539-
file << "POS\tALLELE\tDEPTH\tFREQ\tGAPPED_FREQ\tAVG_QUAL\tFLAGGED_POS\tAMP_MASKED\tPRIMER_BINDING\n";
563+
//file << "POS\tALLELE\tDEPTH\tFREQ\tGAPPED_FREQ\tAVG_QUAL\tFLAGGED_POS\tAMP_MASKED\tPRIMER_BINDING\n";
564+
file << "REGION\tPOS\tREF\tALT\tREF_DP\tREF_RV\tREF_QUAL\tALT_DP\tALT_RV\tALT_QUAL\tALT_FREQ\tTOTAL_DP\tPVAL\tPASS\tGFF_FEATURE\tREF_CODON\tREF_AA\tALT_CODON\tALT_AA\tPOS_AA\tGAPPED_FREQ\tFLAGGED_POS\tAMP_MASKED\tPRIMER_BINDING\n";
540565
for(uint32_t i=0; i < variants.size(); i++){
541566
//find the depth of the deletion to calculate upgapped depth
542567
float del_depth = 0;
@@ -552,13 +577,28 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
552577
if((float)variants[i].alleles[j].depth == 0){
553578
continue;
554579
}
580+
file << "NA\t"; //region
555581
file << std::to_string(variants[i].pos) << "\t";
582+
file << "NA\t"; //ref
556583
std::string test_string = variants[i].alleles[j].nuc;
557584
file << variants[i].alleles[j].nuc << "\t";
558-
file << std::to_string(variants[i].alleles[j].depth) << "\t";
559-
file << std::to_string(freq) << "\t";
585+
file << "NA\t"; //ref dp
586+
file << "NA\t"; //ref rv
587+
file << "NA\t"; //ref qual
588+
file << std::to_string(variants[i].alleles[j].depth) << "\t"; //alt dp
589+
file << "NA\t"; //alt rv
590+
file << std::to_string((float)variants[i].alleles[j].mean_qual / (float)variants[i].alleles[j].depth) << "\t"; //alt qual
591+
file << std::to_string(freq) << "\t"; //alt freq
592+
file << std::to_string(variants[i].depth) << "\t"; //total dp
593+
file << "NA\t"; //pval
594+
file << "NA\t"; //pass
595+
file << "NA\t"; //gff feature
596+
file << "NA\t"; //ref codon
597+
file << "NA\t"; //ref aa
598+
file << "NA\t"; //alt codon
599+
file << "NA\t"; //alt aa
600+
file << "NA\t"; //pos aa
560601
file << std::to_string(gapped_freq) << "\t";
561-
file << std::to_string((float)variants[i].alleles[j].mean_qual / (float)variants[i].alleles[j].depth) << "\t";
562602
std::vector<uint32_t>::iterator it;
563603
it = find(flagged_positions.begin(), flagged_positions.end(), variants[i].pos);
564604
if (it != flagged_positions.end()){

0 commit comments

Comments
 (0)