10
10
#include < algorithm>
11
11
#include < numeric>
12
12
13
- bool test_cluster_deviation (float nearest_cluster, float variant_cluster, float std_dev){
13
+ bool test_cluster_deviation (double nearest_cluster, double variant_cluster, double std_dev){
14
14
bool fluctuation = false ;
15
15
// CLEANUP THIS CAN BE CALCULATED ONCE PER ALL CLUSTERS
16
16
// determine if the assigned and nearest cluster can be resolved based on variant fluctuation
@@ -40,7 +40,7 @@ double find_neighboring_cluster(double freq, uint32_t cluster_assigned, std::vec
40
40
void call_majority_consensus (std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
41
41
// if we can't find a solution simply take the majority variant per position
42
42
std::vector<std::string> nucs;
43
- std::vector<float > freqs;
43
+ std::vector<double > freqs;
44
44
std::vector<std::string> tmp (max_position, " N" );
45
45
for (uint32_t i=1 ; i <= max_position; i++){
46
46
freqs.clear ();
@@ -53,7 +53,7 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
53
53
}
54
54
if (freqs.size () == 0 ) continue ;
55
55
uint32_t index = std::distance (freqs.begin (), std::max_element (freqs.begin (), freqs.end ()));
56
- if (freqs[index] >= ( float ) default_threshold){
56
+ if (freqs[index] >= default_threshold){
57
57
tmp[i-1 ] = nucs[index];
58
58
}
59
59
}
@@ -67,32 +67,32 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
67
67
file.close ();
68
68
}
69
69
70
- float find_nearest_distance (const std::vector<float > all_sums, float value) {
71
- float min_distance = std::numeric_limits<float >::max ();
70
+ double find_nearest_distance (const std::vector<double > all_sums, double value) {
71
+ double min_distance = std::numeric_limits<double >::max ();
72
72
for (auto num : all_sums) {
73
- float distance = std::abs (num - value);
73
+ double distance = std::abs (num - value);
74
74
if (distance < min_distance) {
75
75
min_distance = distance;
76
76
}
77
77
}
78
78
return min_distance;
79
79
}
80
80
81
- bool account_peaks (std::vector<float > possible_solution, std::vector<float > means, float total, float error){
81
+ bool account_peaks (std::vector<double > possible_solution, std::vector<double > means, double total, double error){
82
82
bool valid = true ;
83
- std::vector<float > current;
84
- std::vector<std::vector<float >> results;
83
+ std::vector<double > current;
84
+ std::vector<std::vector<double >> results;
85
85
find_combinations (possible_solution, 0 , current, results);
86
86
87
- std::vector<float > all_sums;
87
+ std::vector<double > all_sums;
88
88
for (auto result : results){
89
- float sum = std::accumulate (result.begin (), result.end (), 0 .0f );
89
+ double sum = std::accumulate (result.begin (), result.end (), 0 .0f );
90
90
all_sums.push_back (sum);
91
91
}
92
92
93
93
// check if all means can be accounted for
94
94
for (auto mean : means){
95
- float dist = find_nearest_distance (all_sums, mean);
95
+ double dist = find_nearest_distance (all_sums, mean);
96
96
if (dist > error){
97
97
valid = false ;
98
98
break ;
@@ -101,23 +101,23 @@ bool account_peaks(std::vector<float> possible_solution, std::vector<float> mean
101
101
return (valid);
102
102
}
103
103
104
- bool within_error_range (std::vector<float > values, float target, float error){
104
+ bool within_error_range (std::vector<double > values, double target, double error){
105
105
// test if the sum of the vector equals the target value within some error
106
- float sum = std::accumulate (values.begin (), values.end (), 0 .0f );
106
+ double sum = std::accumulate (values.begin (), values.end (), 0 .0f );
107
107
if (sum < target+error && sum > target-error){
108
108
return (true );
109
109
} else {
110
110
return (false );
111
111
}
112
112
}
113
113
114
- std::vector<std::vector<float >> find_subsets_with_error (std::vector<float > means, float target, float error){
114
+ std::vector<std::vector<double >> find_subsets_with_error (std::vector<double > means, double target, double error){
115
115
// first we find all the possible combinations
116
- std::vector<float > current;
117
- std::vector<std::vector<float >> results;
116
+ std::vector<double > current;
117
+ std::vector<std::vector<double >> results;
118
118
find_combinations (means, 0 , current, results);
119
119
120
- std::vector<std::vector<float >> valid_combinations;
120
+ std::vector<std::vector<double >> valid_combinations;
121
121
for (uint32_t i=0 ; i < results.size (); i++){
122
122
bool in_range = within_error_range (results[i], target, error);
123
123
if (in_range){
@@ -127,8 +127,8 @@ std::vector<std::vector<float>> find_subsets_with_error(std::vector<float> means
127
127
return (valid_combinations);
128
128
}
129
129
130
- std::vector<std::vector<float >> frequency_pair_finder (std::vector<variant> variants, float lower_bound, float upper_bound, std::vector<float > means){
131
- std::vector<std::vector<float >> pairs;
130
+ std::vector<std::vector<double >> frequency_pair_finder (std::vector<variant> variants, double lower_bound, double upper_bound, std::vector<double > means){
131
+ std::vector<std::vector<double >> pairs;
132
132
std::vector<uint32_t > track_positions;
133
133
134
134
for (uint32_t i=0 ; i < variants.size (); i++){
@@ -139,7 +139,7 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
139
139
size_t index = std::distance (track_positions.begin (), it);
140
140
pairs[index].push_back (means[variants[i].cluster_assigned ]);
141
141
} else {
142
- std::vector<float > tmp = {means[variants[i].cluster_assigned ]};
142
+ std::vector<double > tmp = {means[variants[i].cluster_assigned ]};
143
143
pairs.push_back (tmp);
144
144
track_positions.push_back (variants[i].position );
145
145
}
@@ -149,17 +149,6 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
149
149
return (pairs);
150
150
}
151
151
152
- bool cluster_gravity_analysis (std::vector<std::vector<float >> solutions){
153
- // in the event of multiple solutions, check that the largest cluster is the same
154
- std::vector<float > max_values;
155
- for (auto solution : solutions){
156
- float max = *std::max_element (solution.begin (), solution.end ());
157
- max_values.push_back (max);
158
- }
159
- bool all_same = std::all_of (max_values.begin () + 1 , max_values.end (), [&](float x) { return x == max_values[0 ]; });
160
- return (all_same);
161
- }
162
-
163
152
bool account_for_clusters (std::vector<float > means, std::vector<std::vector<float >> results, float error){
164
153
bool keep = false ;
165
154
std::vector<float > accounted_means;
@@ -190,7 +179,7 @@ bool account_for_clusters(std::vector<float> means, std::vector<std::vector<floa
190
179
return (keep);
191
180
}
192
181
193
- void find_combinations (std::vector<float > means, uint32_t index, std::vector<float > ¤t, std::vector<std::vector<float >> &results){
182
+ void find_combinations (std::vector<double > means, uint32_t index, std::vector<double > ¤t, std::vector<std::vector<double >> &results){
194
183
if (!current.empty ()){
195
184
results.push_back (current);
196
185
}
@@ -201,9 +190,9 @@ void find_combinations(std::vector<float> means, uint32_t index, std::vector<flo
201
190
}
202
191
}
203
192
204
- std::vector<std::vector<float >> find_solutions (std::vector<float > means, float error){
205
- std::vector<float > current;
206
- std::vector<std::vector<float >> results;
193
+ std::vector<std::vector<double >> find_solutions (std::vector<double > means, double error){
194
+ std::vector<double > current;
195
+ std::vector<std::vector<double >> results;
207
196
find_combinations (means, 0 , current, results);
208
197
209
198
std::sort (results.begin (), results.end ());
@@ -212,7 +201,7 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
212
201
auto max_iter = std::max_element (means.begin (), means.end ());
213
202
auto min_iter = std::min_element (means.begin (), means.end ());
214
203
215
- std::vector<std::vector<float >> final_results;
204
+ std::vector<std::vector<double >> final_results;
216
205
// constrain that the solutions must add to 1
217
206
for (uint32_t i=0 ; i < results.size (); i++){
218
207
bool keep = within_error_range (results[i], 1 , error);
@@ -223,11 +212,11 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
223
212
return (final_results);
224
213
}
225
214
226
- std::vector<float > parse_string_to_vector (const std::string& str) {
227
- std::vector<float > result;
215
+ std::vector<double > parse_string_to_vector (const std::string& str) {
216
+ std::vector<double > result;
228
217
std::stringstream ss (str);
229
218
char ch; // Used to read and discard non-numeric characters, including the decimal point
230
- float num;
219
+ double num;
231
220
232
221
// Read characters one by one
233
222
while (ss >> ch) {
@@ -243,15 +232,15 @@ std::vector<float> parse_string_to_vector(const std::string& str) {
243
232
return result;
244
233
}
245
234
246
- std::vector<std::vector<uint32_t >> find_combination_peaks (std::vector<float > solution, std::vector<float > means, std::vector<float > &unresolved){
235
+ std::vector<std::vector<uint32_t >> find_combination_peaks (std::vector<double > solution, std::vector<double > means, std::vector<double > &unresolved){
247
236
std::vector<std::vector<uint32_t >> cluster_indexes (means.size ());
248
- std::vector<float > current;
249
- std::vector<std::vector<float >> results;
250
- std::vector<float > totals;
237
+ std::vector<double > current;
238
+ std::vector<std::vector<double >> results;
239
+ std::vector<double > totals;
251
240
252
241
find_combinations (solution, 0 , current, results);
253
242
for (uint32_t i=0 ; i < results.size (); i++){
254
- float sum = std::accumulate (results[i].begin (), results[i].end (), 0 .0f );
243
+ double sum = std::accumulate (results[i].begin (), results[i].end (), 0 .0f );
255
244
totals.push_back (sum);
256
245
}
257
246
// given a solution and the means, map each cluster to the cluster it contains
@@ -260,9 +249,9 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
260
249
// th mean is part of the solution
261
250
if (it != solution.end ()){
262
251
cluster_indexes[i].push_back (i);
263
- float target = means[i];
264
- std::vector<float > distances (totals.size ());
265
- std::transform (totals.begin (), totals.end (), distances.begin (), [target](float num) { return std::abs (target - num); });
252
+ double target = means[i];
253
+ std::vector<double > distances (totals.size ());
254
+ std::transform (totals.begin (), totals.end (), distances.begin (), [target](double num) { return std::abs (target - num); });
266
255
uint32_t count = 0 ;
267
256
268
257
// this checks the distances from the mean to all other possible peaks
@@ -272,12 +261,12 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
272
261
if (count > 1 ) unresolved.push_back (target);
273
262
274
263
} else {
275
- float target = means[i];
264
+ double target = means[i];
276
265
// the problem with this is that it looks at the min but not if two overlapping peaks occur
277
- auto it = std::min_element (totals.begin (), totals.end (), [target](float a, float b) {return std::abs (a - target) < std::abs (b - target);});
266
+ auto it = std::min_element (totals.begin (), totals.end (), [target](double a, double b) {return std::abs (a - target) < std::abs (b - target);});
278
267
279
- std::vector<float > distances (totals.size ());
280
- std::transform (totals.begin (), totals.end (), distances.begin (), [target](float num) { return std::abs (target - num); });
268
+ std::vector<double > distances (totals.size ());
269
+ std::transform (totals.begin (), totals.end (), distances.begin (), [target](double num) { return std::abs (target - num); });
281
270
uint32_t count = 0 ;
282
271
for (uint32_t d=0 ; d < distances.size (); d++){
283
272
if (distances[d] < 0.03 ) count += 1 ;
@@ -291,13 +280,6 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
291
280
if (count > 1 ) unresolved.push_back (means[i]);
292
281
}
293
282
}
294
- /* for(uint32_t i=0; i < cluster_indexes.size(); i++){
295
- for(uint32_t j=0; j < cluster_indexes[i].size(); j++){
296
- std::cerr << cluster_indexes[i][j] << " ";
297
- }
298
- std::cerr << "\n";
299
- }
300
- for(auto u : unresolved) std::cerr << u << std::endl;*/
301
283
return (cluster_indexes);
302
284
}
303
285
@@ -323,11 +305,11 @@ std::vector<std::vector<double>> deduplicate_solutions(std::vector<std::vector<d
323
305
return (solutions);
324
306
}
325
307
326
- std::vector<float > parse_clustering_results (std::string clustering_file){
308
+ std::vector<double > parse_clustering_results (std::string clustering_file){
327
309
std::ifstream infile (clustering_file + " .txt" );
328
310
std::string line;
329
311
uint32_t count = 0 ;
330
- std::vector<float > numbers;
312
+ std::vector<double > numbers;
331
313
while (std::getline (infile, line)) {
332
314
if (count == 0 ) {
333
315
count += 1 ;
@@ -341,22 +323,25 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
341
323
}
342
324
return (numbers);
343
325
}
344
- void cluster_consensus (std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
345
- float depth_cutoff = 10 ;
326
+ void cluster_consensus (std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
327
+ /*
328
+ Call consensu sequence from clusters.
329
+ */
330
+ double depth_cutoff = 10 ;
346
331
double error = 0.10 ;
347
332
float solution_error = 0.05 ;
348
333
double quality_threshold = 20 ;
349
334
350
- std::vector< float > error_rate = cluster_error (variants_file, quality_threshold);
351
- float freq_lower_bound = error_rate[ 0 ] ;
352
- float freq_upper_bound = error_rate[ 1 ] ;
335
+ double error_rate = cluster_error (variants_file, quality_threshold, depth_cutoff );
336
+ double freq_lower_bound = error_rate+ 0.001 ;
337
+ double freq_upper_bound = 1 -error_rate- 0.001 ;
353
338
354
339
// read in the cluster values
355
- std::vector<float > means = parse_clustering_results (clustering_file);
340
+ std::vector<double > means = parse_clustering_results (clustering_file);
356
341
for (auto m : means){
357
342
std::cerr << " consensus means " << m << std::endl;
358
343
}
359
- std::vector<std::vector<float >> clusters (means.size ());
344
+ std::vector<std::vector<double >> clusters (means.size ());
360
345
for (auto var : variants){
361
346
if (var.cluster_assigned != -1 ){
362
347
clusters[var.cluster_assigned ].push_back (var.freq );
@@ -370,28 +355,28 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
370
355
}
371
356
}
372
357
// find position wise frequency pairs
373
- std::vector<std::vector<float >> pairs = frequency_pair_finder (variants, freq_lower_bound, freq_upper_bound, means);
374
- std::vector<std::vector<float >> solutions = find_solutions (means, error);
358
+ std::vector<std::vector<double >> pairs = frequency_pair_finder (variants, freq_lower_bound, freq_upper_bound, means);
359
+ std::vector<std::vector<double >> solutions = find_solutions (means, error);
375
360
376
361
// find peaks that can't be a subset of other peaks
377
- std::vector<float > non_subset_means;
362
+ std::vector<double > non_subset_means;
378
363
for (uint32_t i=0 ; i < means.size (); i++){
379
- std::vector<std::vector<float >> tmp = find_subsets_with_error (means, means[i], solution_error);
364
+ std::vector<std::vector<double >> tmp = find_subsets_with_error (means, means[i], solution_error);
380
365
if (tmp.size () <= 1 ){
381
366
non_subset_means.push_back (means[i]);
382
367
}
383
368
}
384
369
// reduce solution space to things that contain the non subset peaks
385
- std::vector<std::vector<float >> realistic_solutions;
370
+ std::vector<std::vector<double >> realistic_solutions;
386
371
for (uint32_t i=0 ; i < solutions.size (); i++){
387
- std::vector<float > tmp = solutions[i];
388
- bool found = std::all_of (non_subset_means.begin (), non_subset_means.end (), [&tmp](float value) {return std::find (tmp.begin (), tmp.end (), value) != tmp.end ();});
372
+ std::vector<double > tmp = solutions[i];
373
+ bool found = std::all_of (non_subset_means.begin (), non_subset_means.end (), [&tmp](double value) {return std::find (tmp.begin (), tmp.end (), value) != tmp.end ();});
389
374
if (found){
390
375
realistic_solutions.push_back (solutions[i]);
391
376
}
392
377
}
393
378
// check each solution that every possible peak is accounted for
394
- std::vector<std::vector<float >> solution_sets;
379
+ std::vector<std::vector<double >> solution_sets;
395
380
for (uint32_t i=0 ; i < realistic_solutions.size (); i++){
396
381
bool keep = account_peaks (realistic_solutions[i], means, 1 , solution_error);
397
382
if (keep){
@@ -407,7 +392,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
407
392
}
408
393
std::cerr << " \n " << std::endl;
409
394
410
- std::vector<float > solution;
395
+ std::vector<double > solution;
411
396
bool traditional_majority= false ; // if we can't find a solution call a traditional majority consensus
412
397
if (solution_sets.size () == 0 ){
413
398
std::cerr << clustering_file << " no solution found" << std::endl;
@@ -427,7 +412,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
427
412
for (auto x : solution){
428
413
std::cerr << x << std::endl;
429
414
}
430
- std::vector<float > unresolved;
415
+ std::vector<double > unresolved;
431
416
std::vector<std::vector<uint32_t >> cluster_groups = find_combination_peaks (solution, means, unresolved);
432
417
433
418
std::vector<std::vector<uint32_t >> inverse_groups (means.size ());
0 commit comments