Skip to content

Commit 61271cb

Browse files
committed
adding in previously missed changes
1 parent d487617 commit 61271cb

File tree

2 files changed

+23
-68
lines changed

2 files changed

+23
-68
lines changed

src/gmm.cpp

Lines changed: 18 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
#include <unordered_map>
1010

1111
void calculate_reference_frequency(std::vector<variant> &variants, std::string filename, uint32_t depth_cutoff, float lower_bound, float upper_bound, std::vector<uint32_t> deletion_positions){
12+
//also account for things NOT in the variants file
13+
uint32_t max_pos = 0;
14+
for(auto var : variants){
15+
if(var.position > max_pos) max_pos = var.position;
16+
}
17+
1218
//if we use ivar 1.0 calculate the ref and add in it's freq
1319
std::vector<uint32_t> unique_pos;
1420
std::ifstream infile(filename);
@@ -73,7 +79,6 @@ void calculate_reference_frequency(std::vector<variant> &variants, std::string f
7379
variants.push_back(tmp);
7480
count += 1;
7581
}
76-
7782
}
7883

7984
double calculate_range(const std::vector<double>& vec) {
@@ -129,29 +134,6 @@ double calculate_median(std::vector<double> &data) {
129134
}
130135
}
131136

132-
double calculate_stddev(const std::vector<double>& data, double mean) {
133-
double sum = 0;
134-
for (double val : data) {
135-
sum += std::pow(val - mean, 2);
136-
}
137-
return std::sqrt(sum / data.size());
138-
}
139-
140-
int count_outliers_z_score(const std::vector<double>& data) {
141-
double mean = calculate_mean(data);
142-
double stddev = calculate_stddev(data, mean);
143-
144-
int outlier_count = 0;
145-
for (double value : data) {
146-
double z_score = (value - mean) / stddev;
147-
if (std::abs(z_score) > 3) { // Typically consider |Z| > 3 as an outlier
148-
outlier_count++;
149-
}
150-
}
151-
152-
return outlier_count;
153-
}
154-
155137
void calculate_gapped_frequency(std::vector<variant> &variants, double universal_cluster, double noise_cluster){
156138
for(uint32_t i=0; i < variants.size(); i++){
157139
if(variants[i].pos_del_flag){
@@ -364,7 +346,7 @@ gaussian_mixture_model retrain_model_seeded(uint32_t n, arma::mat data, std::vec
364346

365347
for(uint32_t i=0; i < centroids.size(); i++){
366348
initial_means.col(i) = centroids[i];
367-
cov.col(i) = 0.001;
349+
cov.col(i) = 0.0001;
368350
}
369351

370352
double var_floor = 0.0001;
@@ -744,9 +726,7 @@ void assign_variants_simple(std::vector<variant> &variants, std::vector<std::vec
744726
if(assigned.size() == 0) continue;
745727
//make sure the assignment is concrete
746728
std::vector<uint32_t> assignment_flagged;
747-
//if(unique_pos[i] == 29692){
748729
assignment_flagged = compare_cluster_assignment(tmp_prob, assigned);
749-
//}
750730
for(uint32_t j=0; j < pos_idxs.size(); j++){
751731
std::vector<uint32_t>::iterator tmp = std::find(assignment_flagged.begin(), assignment_flagged.end(), j);
752732
uint32_t k = 0;
@@ -1003,12 +983,12 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1003983
float quality_threshold = 20;
1004984
uint32_t round_val = 4;
1005985

1006-
bool development_mode=false;
986+
bool development_mode=true;
1007987
std::vector<float> error_rate = cluster_error(prefix);
1008988
lower_bound = error_rate[0];
1009989
upper_bound = error_rate[1];
1010990
std::cerr << lower_bound << " " << upper_bound << std::endl;
1011-
991+
//exit(0);
1012992
std::vector<variant> base_variants;
1013993
std::vector<uint32_t> deletion_positions = find_deletion_positions(prefix, depth_cutoff, lower_bound, upper_bound, round_val);
1014994

@@ -1031,7 +1011,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
10311011
variants.push_back(base_variants[i]);
10321012
all_var.push_back(base_variants[i].freq);
10331013
count_pos.push_back(base_variants[i].position);
1034-
std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << std::endl;
1014+
//std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << " " << base_variants[i].depth << std::endl;
10351015
}
10361016
}
10371017
std::cerr << "useful var " << useful_var << std::endl;
@@ -1079,7 +1059,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
10791059
retrained.means.clear();
10801060
retrained.hefts.clear();
10811061
retrained.prob_matrix.clear();
1082-
retrained = retrain_model(counter, data, variants, lower_n, 0.0001);
1062+
retrained = retrain_model(counter, data, variants, lower_n, 0.001);
10831063
bool optimal = true;
10841064
assign_clusters(variants, retrained, lower_n);
10851065
std::vector<std::vector<double>> clusters(counter);
@@ -1097,6 +1077,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
10971077
continue;
10981078
}
10991079
int count_far = 0;
1080+
int count_far_far = 0;
11001081
for(auto d : data){
11011082
std::cerr << d << " ";
11021083
if(std::abs(d-mean) > 0.10){
@@ -1105,11 +1086,12 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11051086
}
11061087
std::cerr << "\n";
11071088
float percent_far = (float)count_far / (float)useful_var;
1108-
1089+
float percent_far_far = (float)count_far_far / (float)useful_var;
1090+
11091091
tmp_mads.push_back(mad);
11101092
tmp_percent_far.push_back(percent_far);
11111093
float ratio = (float)useful_var / (float) counter;
1112-
std::cerr << "mean " << mean << " mad " << mad << " cluster size " << data.size() << " count far " << count_far << " percent far " << percent_far << " ratio " << ratio << std::endl;
1094+
std::cerr << "mean " << mean << " mad " << mad << " cluster size " << data.size() << " count far " << count_far << " percent far " << std::endl;
11131095
if(ratio >= 5){
11141096
if(mad <= 0.10 && percent_far <= 0.10){
11151097
optimal = true;
@@ -1171,39 +1153,6 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
11711153
final_means.push_back(mean);
11721154
}
11731155
std::ofstream file;
1174-
//write mad to strings for use
1175-
if(development_mode){
1176-
std::string mads_string;
1177-
for(uint32_t l=0; l < mads.size(); l++){
1178-
std::string tmp_str = "[";
1179-
for(uint32_t d=0; d < mads[l].size(); d++){
1180-
if(d != 0) tmp_str += ",";
1181-
tmp_str += std::to_string(mads[l][d]);
1182-
}
1183-
tmp_str += "]\n";
1184-
mads_string += tmp_str;
1185-
}
1186-
std::string mad_output = output_prefix + "_mad.txt";
1187-
file.open(mad_output, std::ios::trunc);
1188-
file << "MADS\n";
1189-
file << mads_string;
1190-
file.close();
1191-
}
1192-
std::string percent_string;
1193-
for(uint32_t l=0; l < percents.size(); l++){
1194-
std::string tmp_str = "[";
1195-
for(uint32_t d=0; d < percents[l].size(); d++){
1196-
if(d != 0) tmp_str += ",";
1197-
tmp_str += std::to_string(percents[l][d]);
1198-
}
1199-
tmp_str += "]\n";
1200-
percent_string += tmp_str;
1201-
}
1202-
std::string percent_output = output_prefix + "_percent.txt";
1203-
file.open(percent_output, std::ios::trunc);
1204-
file << "PERCENTS\n";
1205-
file << percent_string;
1206-
file.close();
12071156

12081157
//write means to string
12091158
file.open(output_prefix + ".txt", std::ios::trunc);
@@ -1235,6 +1184,9 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
12351184
}*/
12361185
//could benefit from redoing lines in variants file as gapped/ungapped depth
12371186
for(uint32_t i=0; i < base_variants.size(); i++){
1187+
if(base_variants[i].del_flag){
1188+
continue;
1189+
}
12381190
if(!base_variants[i].outside_freq_range && !base_variants[i].pos_del_flag){
12391191
useful_var += 1;
12401192
variants.push_back(base_variants[i]);

src/interval_tree.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class IntervalTree {
5555
void set_haplotypes(ITNode *root, primer prim);
5656
int unpaired_primers(ITNode *root, primer prim);
5757
void combine_haplotypes(ITNode *root);
58+
void write_out_frequencies(ITNode *root, std::string filename);
5859
void detect_abberations(ITNode *root, uint32_t pos);
5960
void detect_amplicon_overlaps(ITNode *root, uint32_t pos);
6061
void detect_primer_issues(ITNode *root, uint32_t pos);
@@ -80,8 +81,9 @@ class IntervalTree {
8081
void detect_amplicon_overlaps(uint32_t pos) {detect_amplicon_overlaps(_root, pos);}
8182
void detect_primer_issues(uint32_t pos) {detect_primer_issues(_root, pos);}
8283
void combine_haplotypes() {combine_haplotypes(_root);}
84+
void write_out_frequencies(std::string filename){write_out_frequencies(_root, filename);}
8385
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
84-
void populate_variants();
86+
void populate_variants(uint32_t last_position);
8587
void find_read_amplicon(uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities) {find_read_amplicon(_root, lower, upper, positions, bases, qualities);}
8688
void amplicon_position_pop() {amplicon_position_pop(_root);}
8789
};
@@ -91,11 +93,12 @@ void detect_abberations(ITNode *root, uint32_t find_position);
9193
void get_max_pos();
9294
void set_haplotypes(ITNode *root, primer prim);
9395
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
94-
void populate_variants();
96+
void populate_variants(uint32_t last_position);
9597
int unpaired_primers(ITNode *root, primer prim);
9698
void detect_primer_issues(ITNode *root, uint32_t find_position);
9799
void detect_amplicon_overlaps(ITNode *root, uint32_t find_position);
98100
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint8_t> qualities);
99101
IntervalTree populate_amplicons(std::string pair_info_file, std::vector<primer> &primers);
100102
IntervalTree amplicon_position_pop();
103+
void write_out_frequencies(ITNode *root, std::string filename);
101104
#endif

0 commit comments

Comments
 (0)