Skip to content

Commit 5089336

Browse files
committed
switching from float to double
1 parent 0abe1e7 commit 5089336

File tree

5 files changed

+91
-151
lines changed

5 files changed

+91
-151
lines changed

src/call_consensus_clustering.cpp

Lines changed: 63 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#include <algorithm>
1111
#include <numeric>
1212

13-
bool test_cluster_deviation(float nearest_cluster, float variant_cluster, float std_dev){
13+
bool test_cluster_deviation(double nearest_cluster, double variant_cluster, double std_dev){
1414
bool fluctuation = false;
1515
//CLEANUP THIS CAN BE CALCULATED ONCE PER ALL CLUSTERS
1616
//determine if the assigned and nearest cluster can be resolved based on variant fluctuation
@@ -40,7 +40,7 @@ double find_neighboring_cluster(double freq, uint32_t cluster_assigned, std::vec
4040
void call_majority_consensus(std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
4141
//if we can't find a solution simply take the majority variant per position
4242
std::vector<std::string> nucs;
43-
std::vector<float> freqs;
43+
std::vector<double> freqs;
4444
std::vector<std::string> tmp(max_position, "N");
4545
for(uint32_t i=1; i <= max_position; i++){
4646
freqs.clear();
@@ -53,7 +53,7 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
5353
}
5454
if(freqs.size() == 0) continue;
5555
uint32_t index = std::distance(freqs.begin(), std::max_element(freqs.begin(), freqs.end()));
56-
if(freqs[index] >= (float)default_threshold){
56+
if(freqs[index] >= default_threshold){
5757
tmp[i-1] = nucs[index];
5858
}
5959
}
@@ -67,32 +67,32 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
6767
file.close();
6868
}
6969

// Returns the smallest absolute difference between `value` and any element
// of `all_sums`.
//
// all_sums: candidate values to compare against; taken by const reference so
//           the vector is not copied on every call.
// value:    the value whose nearest neighbour distance is wanted.
//
// When `all_sums` is empty there is nothing to measure against and the result
// is std::numeric_limits<double>::max().
double find_nearest_distance(const std::vector<double> &all_sums, double value) {
  double min_distance = std::numeric_limits<double>::max();
  for (const double candidate : all_sums) {
    min_distance = std::min(min_distance, std::abs(candidate - value));
  }
  return min_distance;
}
8080

81-
bool account_peaks(std::vector<float> possible_solution, std::vector<float> means, float total, float error){
81+
bool account_peaks(std::vector<double> possible_solution, std::vector<double> means, double total, double error){
8282
bool valid = true;
83-
std::vector<float> current;
84-
std::vector<std::vector<float>> results;
83+
std::vector<double> current;
84+
std::vector<std::vector<double>> results;
8585
find_combinations(possible_solution, 0, current, results);
8686

87-
std::vector<float> all_sums;
87+
std::vector<double> all_sums;
8888
for(auto result : results){
89-
float sum = std::accumulate(result.begin(), result.end(), 0.0f);
89+
double sum = std::accumulate(result.begin(), result.end(), 0.0f);
9090
all_sums.push_back(sum);
9191
}
9292

9393
//check if all means can be accounted for
9494
for(auto mean : means){
95-
float dist = find_nearest_distance(all_sums, mean);
95+
double dist = find_nearest_distance(all_sums, mean);
9696
if(dist > error){
9797
valid = false;
9898
break;
@@ -101,23 +101,23 @@ bool account_peaks(std::vector<float> possible_solution, std::vector<float> mean
101101
return(valid);
102102
}
103103

// Tests whether the sum of `values` equals `target` within some error.
//
// values: the numbers to add up.
// target: the value the sum is compared against.
// error:  half-width of the accepted window around `target`.
//
// Returns true only when the sum lies strictly inside the open interval
// (target - error, target + error); a sum exactly on either bound is rejected.
bool within_error_range(std::vector<double> values, double target, double error){
  // The initial value fixes the accumulator type of std::accumulate. It has to
  // be a double (0.0): with 0.0f every partial sum is rounded to float.
  const double sum = std::accumulate(values.begin(), values.end(), 0.0);
  return sum > target - error && sum < target + error;
}
113113

114-
std::vector<std::vector<float>> find_subsets_with_error(std::vector<float> means, float target, float error){
114+
std::vector<std::vector<double>> find_subsets_with_error(std::vector<double> means, double target, double error){
115115
//first we find all the possible combinations
116-
std::vector<float> current;
117-
std::vector<std::vector<float>> results;
116+
std::vector<double> current;
117+
std::vector<std::vector<double>> results;
118118
find_combinations(means, 0, current, results);
119119

120-
std::vector<std::vector<float>> valid_combinations;
120+
std::vector<std::vector<double>> valid_combinations;
121121
for(uint32_t i=0; i < results.size(); i++){
122122
bool in_range = within_error_range(results[i], target, error);
123123
if(in_range){
@@ -127,8 +127,8 @@ std::vector<std::vector<float>> find_subsets_with_error(std::vector<float> means
127127
return(valid_combinations);
128128
}
129129

130-
std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> variants, float lower_bound, float upper_bound, std::vector<float> means){
131-
std::vector<std::vector<float>> pairs;
130+
std::vector<std::vector<double>> frequency_pair_finder(std::vector<variant> variants, double lower_bound, double upper_bound, std::vector<double> means){
131+
std::vector<std::vector<double>> pairs;
132132
std::vector<uint32_t> track_positions;
133133

134134
for(uint32_t i=0; i < variants.size(); i++){
@@ -139,7 +139,7 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
139139
size_t index = std::distance(track_positions.begin(), it);
140140
pairs[index].push_back(means[variants[i].cluster_assigned]);
141141
} else{
142-
std::vector<float> tmp = {means[variants[i].cluster_assigned]};
142+
std::vector<double> tmp = {means[variants[i].cluster_assigned]};
143143
pairs.push_back(tmp);
144144
track_positions.push_back(variants[i].position);
145145
}
@@ -149,17 +149,6 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
149149
return(pairs);
150150
}
151151

152-
bool cluster_gravity_analysis(std::vector<std::vector<float>> solutions){
153-
//in the event of multiple solutions, check that the largest cluster is the same
154-
std::vector<float> max_values;
155-
for(auto solution : solutions){
156-
float max = *std::max_element(solution.begin(), solution.end());
157-
max_values.push_back(max);
158-
}
159-
bool all_same = std::all_of(max_values.begin() + 1, max_values.end(), [&](float x) { return x == max_values[0]; });
160-
return(all_same);
161-
}
162-
163152
bool account_for_clusters(std::vector<float> means, std::vector<std::vector<float>> results, float error){
164153
bool keep = false;
165154
std::vector<float> accounted_means;
@@ -190,7 +179,7 @@ bool account_for_clusters(std::vector<float> means, std::vector<std::vector<floa
190179
return(keep);
191180
}
192181

193-
void find_combinations(std::vector<float> means, uint32_t index, std::vector<float> &current, std::vector<std::vector<float>> &results){
182+
void find_combinations(std::vector<double> means, uint32_t index, std::vector<double> &current, std::vector<std::vector<double>> &results){
194183
if (!current.empty()){
195184
results.push_back(current);
196185
}
@@ -201,9 +190,9 @@ void find_combinations(std::vector<float> means, uint32_t index, std::vector<flo
201190
}
202191
}
203192

204-
std::vector<std::vector<float>> find_solutions(std::vector<float> means, float error){
205-
std::vector<float> current;
206-
std::vector<std::vector<float>> results;
193+
std::vector<std::vector<double>> find_solutions(std::vector<double> means, double error){
194+
std::vector<double> current;
195+
std::vector<std::vector<double>> results;
207196
find_combinations(means, 0, current, results);
208197

209198
std::sort(results.begin(), results.end());
@@ -212,7 +201,7 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
212201
auto max_iter = std::max_element(means.begin(), means.end());
213202
auto min_iter = std::min_element(means.begin(), means.end());
214203

215-
std::vector<std::vector<float>> final_results;
204+
std::vector<std::vector<double>> final_results;
216205
//constrain that the solutions must add to 1
217206
for(uint32_t i=0; i < results.size(); i++){
218207
bool keep = within_error_range(results[i], 1, error);
@@ -223,11 +212,11 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
223212
return(final_results);
224213
}
225214

226-
std::vector<float> parse_string_to_vector(const std::string& str) {
227-
std::vector<float> result;
215+
std::vector<double> parse_string_to_vector(const std::string& str) {
216+
std::vector<double> result;
228217
std::stringstream ss(str);
229218
char ch; // Used to read and discard non-numeric characters, including the decimal point
230-
float num;
219+
double num;
231220

232221
// Read characters one by one
233222
while (ss >> ch) {
@@ -243,15 +232,15 @@ std::vector<float> parse_string_to_vector(const std::string& str) {
243232
return result;
244233
}
245234

246-
std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> solution, std::vector<float> means, std::vector<float> &unresolved){
235+
std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<double> solution, std::vector<double> means, std::vector<double> &unresolved){
247236
std::vector<std::vector<uint32_t>> cluster_indexes(means.size());
248-
std::vector<float> current;
249-
std::vector<std::vector<float>> results;
250-
std::vector<float> totals;
237+
std::vector<double> current;
238+
std::vector<std::vector<double>> results;
239+
std::vector<double> totals;
251240

252241
find_combinations(solution, 0, current, results);
253242
for(uint32_t i=0; i < results.size(); i++){
254-
float sum = std::accumulate(results[i].begin(), results[i].end(), 0.0f);
243+
double sum = std::accumulate(results[i].begin(), results[i].end(), 0.0f);
255244
totals.push_back(sum);
256245
}
257246
//given a solution and the means, map each cluster to the cluster it contains
@@ -260,9 +249,9 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
260249
//the mean is part of the solution
261250
if(it != solution.end()){
262251
cluster_indexes[i].push_back(i);
263-
float target = means[i];
264-
std::vector<float> distances(totals.size());
265-
std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); });
252+
double target = means[i];
253+
std::vector<double> distances(totals.size());
254+
std::transform(totals.begin(), totals.end(), distances.begin(), [target](double num) { return std::abs(target - num); });
266255
uint32_t count = 0;
267256

268257
//this checks the distances from the mean to all other possible peaks
@@ -272,12 +261,12 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
272261
if(count > 1) unresolved.push_back(target);
273262

274263
} else {
275-
float target = means[i];
264+
double target = means[i];
276265
//the problem with this is that it looks at the min but not if two overlapping peaks occur
277-
auto it = std::min_element(totals.begin(), totals.end(), [target](float a, float b) {return std::abs(a - target) < std::abs(b - target);});
266+
auto it = std::min_element(totals.begin(), totals.end(), [target](double a, double b) {return std::abs(a - target) < std::abs(b - target);});
278267

279-
std::vector<float> distances(totals.size());
280-
std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); });
268+
std::vector<double> distances(totals.size());
269+
std::transform(totals.begin(), totals.end(), distances.begin(), [target](double num) { return std::abs(target - num); });
281270
uint32_t count = 0;
282271
for(uint32_t d=0; d < distances.size(); d++){
283272
if(distances[d] < 0.03) count += 1;
@@ -291,13 +280,6 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
291280
if(count > 1) unresolved.push_back(means[i]);
292281
}
293282
}
294-
/*for(uint32_t i=0; i < cluster_indexes.size(); i++){
295-
for(uint32_t j=0; j < cluster_indexes[i].size(); j++){
296-
std::cerr << cluster_indexes[i][j] << " ";
297-
}
298-
std::cerr << "\n";
299-
}
300-
for(auto u : unresolved) std::cerr << u << std::endl;*/
301283
return(cluster_indexes);
302284
}
303285

@@ -323,11 +305,11 @@ std::vector<std::vector<double>> deduplicate_solutions(std::vector<std::vector<d
323305
return(solutions);
324306
}
325307

326-
std::vector<float> parse_clustering_results(std::string clustering_file){
308+
std::vector<double> parse_clustering_results(std::string clustering_file){
327309
std::ifstream infile(clustering_file + ".txt");
328310
std::string line;
329311
uint32_t count = 0;
330-
std::vector<float> numbers;
312+
std::vector<double> numbers;
331313
while (std::getline(infile, line)) {
332314
if(count == 0) {
333315
count += 1;
@@ -341,22 +323,25 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
341323
}
342324
return(numbers);
343325
}
344-
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
345-
float depth_cutoff = 10;
326+
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
327+
/*
328+
Call consensus sequence from clusters.
329+
*/
330+
double depth_cutoff = 10;
346331
double error = 0.10;
347332
float solution_error = 0.05;
348333
double quality_threshold = 20;
349334

350-
std::vector<float> error_rate = cluster_error(variants_file, quality_threshold);
351-
float freq_lower_bound = error_rate[0];
352-
float freq_upper_bound = error_rate[1];
335+
double error_rate = cluster_error(variants_file, quality_threshold, depth_cutoff);
336+
double freq_lower_bound = error_rate+0.001;
337+
double freq_upper_bound = 1-error_rate-0.001;
353338

354339
//read in the cluster values
355-
std::vector<float> means = parse_clustering_results(clustering_file);
340+
std::vector<double> means = parse_clustering_results(clustering_file);
356341
for(auto m : means){
357342
std::cerr << "consensus means " << m << std::endl;
358343
}
359-
std::vector<std::vector<float>> clusters(means.size());
344+
std::vector<std::vector<double>> clusters(means.size());
360345
for(auto var : variants){
361346
if(var.cluster_assigned != -1){
362347
clusters[var.cluster_assigned].push_back(var.freq);
@@ -370,28 +355,28 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
370355
}
371356
}
372357
//find position wise frequency pairs
373-
std::vector<std::vector<float>> pairs = frequency_pair_finder(variants, freq_lower_bound, freq_upper_bound, means);
374-
std::vector<std::vector<float>> solutions = find_solutions(means, error);
358+
std::vector<std::vector<double>> pairs = frequency_pair_finder(variants, freq_lower_bound, freq_upper_bound, means);
359+
std::vector<std::vector<double>> solutions = find_solutions(means, error);
375360

376361
//find peaks that can't be a subset of other peaks
377-
std::vector<float> non_subset_means;
362+
std::vector<double> non_subset_means;
378363
for(uint32_t i=0; i < means.size(); i++){
379-
std::vector<std::vector<float>> tmp = find_subsets_with_error(means, means[i], solution_error);
364+
std::vector<std::vector<double>> tmp = find_subsets_with_error(means, means[i], solution_error);
380365
if(tmp.size() <= 1){
381366
non_subset_means.push_back(means[i]);
382367
}
383368
}
384369
//reduce solution space to things that contain the non subset peaks
385-
std::vector<std::vector<float>> realistic_solutions;
370+
std::vector<std::vector<double>> realistic_solutions;
386371
for(uint32_t i=0; i < solutions.size(); i++){
387-
std::vector<float> tmp = solutions[i];
388-
bool found = std::all_of(non_subset_means.begin(), non_subset_means.end(), [&tmp](float value) {return std::find(tmp.begin(), tmp.end(), value) != tmp.end();});
372+
std::vector<double> tmp = solutions[i];
373+
bool found = std::all_of(non_subset_means.begin(), non_subset_means.end(), [&tmp](double value) {return std::find(tmp.begin(), tmp.end(), value) != tmp.end();});
389374
if(found){
390375
realistic_solutions.push_back(solutions[i]);
391376
}
392377
}
393378
//check each solution that every possible peak is accounted for
394-
std::vector<std::vector<float>> solution_sets;
379+
std::vector<std::vector<double>> solution_sets;
395380
for(uint32_t i=0; i < realistic_solutions.size(); i++){
396381
bool keep = account_peaks(realistic_solutions[i], means, 1, solution_error);
397382
if(keep){
@@ -407,7 +392,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
407392
}
408393
std::cerr << "\n" << std::endl;
409394

410-
std::vector<float> solution;
395+
std::vector<double> solution;
411396
bool traditional_majority= false; //if we can't find a solution call a traditional majority consensus
412397
if(solution_sets.size() == 0){
413398
std::cerr << clustering_file << " no solution found" << std::endl;
@@ -427,7 +412,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
427412
for(auto x : solution){
428413
std::cerr << x << std::endl;
429414
}
430-
std::vector<float> unresolved;
415+
std::vector<double> unresolved;
431416
std::vector<std::vector<uint32_t>> cluster_groups = find_combination_peaks(solution, means, unresolved);
432417

433418
std::vector<std::vector<uint32_t>> inverse_groups(means.size());

src/call_consensus_clustering.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
#include "gmm.h"
#ifndef call_consensus_clustering
#define call_consensus_clustering
// Calls a consensus sequence from clustered variants.
// NOTE(review): the cluster means appear to be read from `clustering_file`
// with ".txt" appended; confirm against the implementation.
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename, double default_threshold);
// Collects combinations of the values in `means` into `results`, using
// `current` as the working combination and `index` as the starting position.
// NOTE(review): the exact enumeration order is defined in the .cpp file; verify there.
void find_combinations(std::vector<double> means, uint32_t index, std::vector<double> &current, std::vector<std::vector<double>> &results);
#endif

0 commit comments

Comments
 (0)