9
9
#include < unordered_map>
10
10
11
11
void calculate_reference_frequency (std::vector<variant> &variants, std::string filename, uint32_t depth_cutoff, float lower_bound, float upper_bound, std::vector<uint32_t > deletion_positions){
12
+ // also account for things NOT in the variants file
13
+ uint32_t max_pos = 0 ;
14
+ for (auto var : variants){
15
+ if (var.position > max_pos) max_pos = var.position ;
16
+ }
17
+
12
18
// if we use ivar 1.0, calculate the reference and add in its frequency
13
19
std::vector<uint32_t > unique_pos;
14
20
std::ifstream infile (filename);
@@ -73,7 +79,6 @@ void calculate_reference_frequency(std::vector<variant> &variants, std::string f
73
79
variants.push_back (tmp);
74
80
count += 1 ;
75
81
}
76
-
77
82
}
78
83
79
84
double calculate_range (const std::vector<double >& vec) {
@@ -129,29 +134,6 @@ double calculate_median(std::vector<double> &data) {
129
134
}
130
135
}
131
136
132
// Population standard deviation of `data` around a caller-supplied `mean`.
//
// data: samples to measure; may be empty.
// mean: the centre the squared deviations are taken from; it is used as
//       given and is not recomputed here.
// Returns sqrt(sum((x - mean)^2) / N), dividing by N rather than N - 1.
// An empty input returns 0.0 instead of the NaN that 0.0 / 0 would produce.
double calculate_stddev(const std::vector<double>& data, double mean) {
  if (data.empty()) {
    return 0.0;
  }
  double sum = 0;
  for (const double val : data) {
    const double deviation = val - mean;
    // Plain multiplication avoids a general-purpose std::pow call per sample.
    sum += deviation * deviation;
  }
  return std::sqrt(sum / static_cast<double>(data.size()));
}
139
-
140
- int count_outliers_z_score (const std::vector<double >& data) {
141
- double mean = calculate_mean (data);
142
- double stddev = calculate_stddev (data, mean);
143
-
144
- int outlier_count = 0 ;
145
- for (double value : data) {
146
- double z_score = (value - mean) / stddev;
147
- if (std::abs (z_score) > 3 ) { // Typically consider |Z| > 3 as an outlier
148
- outlier_count++;
149
- }
150
- }
151
-
152
- return outlier_count;
153
- }
154
-
155
137
void calculate_gapped_frequency (std::vector<variant> &variants, double universal_cluster, double noise_cluster){
156
138
for (uint32_t i=0 ; i < variants.size (); i++){
157
139
if (variants[i].pos_del_flag ){
@@ -364,7 +346,7 @@ gaussian_mixture_model retrain_model_seeded(uint32_t n, arma::mat data, std::vec
364
346
365
347
for (uint32_t i=0 ; i < centroids.size (); i++){
366
348
initial_means.col (i) = centroids[i];
367
- cov.col (i) = 0.001 ;
349
+ cov.col (i) = 0.0001 ;
368
350
}
369
351
370
352
double var_floor = 0.0001 ;
@@ -744,9 +726,7 @@ void assign_variants_simple(std::vector<variant> &variants, std::vector<std::vec
744
726
if (assigned.size () == 0 ) continue ;
745
727
// make sure the assignment is concrete
746
728
std::vector<uint32_t > assignment_flagged;
747
- // if(unique_pos[i] == 29692){
748
729
assignment_flagged = compare_cluster_assignment (tmp_prob, assigned);
749
- // }
750
730
for (uint32_t j=0 ; j < pos_idxs.size (); j++){
751
731
std::vector<uint32_t >::iterator tmp = std::find (assignment_flagged.begin (), assignment_flagged.end (), j);
752
732
uint32_t k = 0 ;
@@ -1003,12 +983,12 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1003
983
float quality_threshold = 20 ;
1004
984
uint32_t round_val = 4 ;
1005
985
1006
- bool development_mode=false ;
986
+ bool development_mode=true ;
1007
987
std::vector<float > error_rate = cluster_error (prefix);
1008
988
lower_bound = error_rate[0 ];
1009
989
upper_bound = error_rate[1 ];
1010
990
std::cerr << lower_bound << " " << upper_bound << std::endl;
1011
-
991
+ // exit(0);
1012
992
std::vector<variant> base_variants;
1013
993
std::vector<uint32_t > deletion_positions = find_deletion_positions (prefix, depth_cutoff, lower_bound, upper_bound, round_val);
1014
994
@@ -1031,7 +1011,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1031
1011
variants.push_back (base_variants[i]);
1032
1012
all_var.push_back (base_variants[i].freq );
1033
1013
count_pos.push_back (base_variants[i].position );
1034
- std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << std::endl;
1014
+ // std::cerr << base_variants[i].freq << " " << base_variants[i].position << " " << base_variants[i].nuc << " " << base_variants[i].depth << std::endl;
1035
1015
}
1036
1016
}
1037
1017
std::cerr << " useful var " << useful_var << std::endl;
@@ -1079,7 +1059,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1079
1059
retrained.means .clear ();
1080
1060
retrained.hefts .clear ();
1081
1061
retrained.prob_matrix .clear ();
1082
- retrained = retrain_model (counter, data, variants, lower_n, 0.0001 );
1062
+ retrained = retrain_model (counter, data, variants, lower_n, 0.001 );
1083
1063
bool optimal = true ;
1084
1064
assign_clusters (variants, retrained, lower_n);
1085
1065
std::vector<std::vector<double >> clusters (counter);
@@ -1097,6 +1077,7 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1097
1077
continue ;
1098
1078
}
1099
1079
int count_far = 0 ;
1080
+ int count_far_far = 0 ;
1100
1081
for (auto d : data){
1101
1082
std::cerr << d << " " ;
1102
1083
if (std::abs (d-mean) > 0.10 ){
@@ -1105,11 +1086,12 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1105
1086
}
1106
1087
std::cerr << " \n " ;
1107
1088
float percent_far = (float )count_far / (float )useful_var;
1108
-
1089
+ float percent_far_far = (float )count_far_far / (float )useful_var;
1090
+
1109
1091
tmp_mads.push_back (mad);
1110
1092
tmp_percent_far.push_back (percent_far);
1111
1093
float ratio = (float )useful_var / (float ) counter;
1112
- std::cerr << " mean " << mean << " mad " << mad << " cluster size " << data.size () << " count far " << count_far << " percent far " << percent_far << " ratio " << ratio << std::endl;
1094
+ std::cerr << " mean " << mean << " mad " << mad << " cluster size " << data.size () << " count far " << count_far << " percent far " << std::endl;
1113
1095
if (ratio >= 5 ){
1114
1096
if (mad <= 0.10 && percent_far <= 0.10 ){
1115
1097
optimal = true ;
@@ -1171,39 +1153,6 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1171
1153
final_means.push_back (mean);
1172
1154
}
1173
1155
std::ofstream file;
1174
- // write mad to strings for use
1175
- if (development_mode){
1176
- std::string mads_string;
1177
- for (uint32_t l=0 ; l < mads.size (); l++){
1178
- std::string tmp_str = " [" ;
1179
- for (uint32_t d=0 ; d < mads[l].size (); d++){
1180
- if (d != 0 ) tmp_str += " ," ;
1181
- tmp_str += std::to_string (mads[l][d]);
1182
- }
1183
- tmp_str += " ]\n " ;
1184
- mads_string += tmp_str;
1185
- }
1186
- std::string mad_output = output_prefix + " _mad.txt" ;
1187
- file.open (mad_output, std::ios::trunc);
1188
- file << " MADS\n " ;
1189
- file << mads_string;
1190
- file.close ();
1191
- }
1192
- std::string percent_string;
1193
- for (uint32_t l=0 ; l < percents.size (); l++){
1194
- std::string tmp_str = " [" ;
1195
- for (uint32_t d=0 ; d < percents[l].size (); d++){
1196
- if (d != 0 ) tmp_str += " ," ;
1197
- tmp_str += std::to_string (percents[l][d]);
1198
- }
1199
- tmp_str += " ]\n " ;
1200
- percent_string += tmp_str;
1201
- }
1202
- std::string percent_output = output_prefix + " _percent.txt" ;
1203
- file.open (percent_output, std::ios::trunc);
1204
- file << " PERCENTS\n " ;
1205
- file << percent_string;
1206
- file.close ();
1207
1156
1208
1157
// write means to string
1209
1158
file.open (output_prefix + " .txt" , std::ios::trunc);
@@ -1235,6 +1184,9 @@ std::vector<variant> gmm_model(std::string prefix, std::string output_prefix){
1235
1184
}*/
1236
1185
// could benefit from redoing lines in variants file as gapped/ungapped depth
1237
1186
for (uint32_t i=0 ; i < base_variants.size (); i++){
1187
+ if (base_variants[i].del_flag ){
1188
+ continue ;
1189
+ }
1238
1190
if (!base_variants[i].outside_freq_range && !base_variants[i].pos_del_flag ){
1239
1191
useful_var += 1 ;
1240
1192
variants.push_back (base_variants[i]);
0 commit comments