andersen-lab
diff --git a/src/call_consensus_clustering.cpp b/src/call_consensus_clustering.cpp
Lines changed: 63 additions & 78 deletions
diff --git a/src/call_consensus_clustering.h b/src/call_consensus_clustering.h
Lines changed: 1 addition & 2 deletions
@@ -10,7 +10,7 @@
 #include <algorithm>
 #include <numeric>
 
-bool test_cluster_deviation(float nearest_cluster, float variant_cluster, float std_dev){
+bool test_cluster_deviation(double nearest_cluster, double variant_cluster, double std_dev){
   bool fluctuation = false;
   //CLEANUP THIS CAN BE CALCULATED ONCE PER ALL CLUSTERS
   //determine if the assigned and nearest cluster can be resolved based on variant fluctuation
@@ -40,7 +40,7 @@ double find_neighboring_cluster(double freq, uint32_t cluster_assigned, std::vec
 void call_majority_consensus(std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
   //if we can't find a solution simply take the majority variant per position
   std::vector<std::string> nucs;
-  std::vector<float> freqs;
+  std::vector<double> freqs;
   std::vector<std::string> tmp(max_position, "N");
   for(uint32_t i=1; i <= max_position; i++){
     freqs.clear();
@@ -53,7 +53,7 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
     }
     if(freqs.size() == 0) continue;
     uint32_t index = std::distance(freqs.begin(), std::max_element(freqs.begin(), freqs.end()));
-    if(freqs[index] >= (float)default_threshold){
+    if(freqs[index] >= default_threshold){
       tmp[i-1] = nucs[index];
     }
   }
@@ -67,32 +67,32 @@ void call_majority_consensus(std::vector<variant> variants, uint32_t max_positio
   file.close(); 
 }
 
-float find_nearest_distance(const std::vector<float> all_sums, float value) {
-    float min_distance = std::numeric_limits<float>::max();
+double find_nearest_distance(const std::vector<double> all_sums, double value) {
+    double min_distance = std::numeric_limits<double>::max();
     for (auto num : all_sums) {
-        float distance = std::abs(num - value);
+        double distance = std::abs(num - value);
         if (distance < min_distance) {
             min_distance = distance;
         }
     }
     return min_distance;
 }
 
-bool account_peaks(std::vector<float> possible_solution, std::vector<float> means, float total, float error){
+bool account_peaks(std::vector<double> possible_solution, std::vector<double> means, double total, double error){
   bool valid = true;
-  std::vector<float> current;
-  std::vector<std::vector<float>> results;
+  std::vector<double> current;
+  std::vector<std::vector<double>> results;
   find_combinations(possible_solution, 0, current, results);
 
-  std::vector<float> all_sums; 
+  std::vector<double> all_sums; 
   for(auto result : results){
-    float sum = std::accumulate(result.begin(), result.end(), 0.0f);
+    double sum = std::accumulate(result.begin(), result.end(), 0.0);
     all_sums.push_back(sum);
   }
 
   //check if all means can be accounted for
   for(auto mean : means){
-    float dist = find_nearest_distance(all_sums, mean);
+    double dist = find_nearest_distance(all_sums, mean);
     if(dist > error){
       valid = false;
       break;
@@ -101,23 +101,23 @@ bool account_peaks(std::vector<float> possible_solution, std::vector<float> mean
   return(valid);
 }
 
-bool within_error_range(std::vector<float> values, float target, float error){
+bool within_error_range(std::vector<double> values, double target, double error){
   //test if the sum of the vector equals the target value within some error
-  float sum = std::accumulate(values.begin(), values.end(), 0.0f);
+  double sum = std::accumulate(values.begin(), values.end(), 0.0);
   if(sum < target+error && sum > target-error){
     return(true);  
   } else{
     return(false);
   }
 }
 
-std::vector<std::vector<float>> find_subsets_with_error(std::vector<float> means, float target, float error){
+std::vector<std::vector<double>> find_subsets_with_error(std::vector<double> means, double target, double error){
   //first we find all the possible combinations
-  std::vector<float> current;
-  std::vector<std::vector<float>> results;
+  std::vector<double> current;
+  std::vector<std::vector<double>> results;
   find_combinations(means, 0, current, results);
 
-  std::vector<std::vector<float>> valid_combinations;  
+  std::vector<std::vector<double>> valid_combinations;  
   for(uint32_t i=0; i < results.size(); i++){
     bool in_range = within_error_range(results[i], target, error);
     if(in_range){
@@ -127,8 +127,8 @@ std::vector<std::vector<float>> find_subsets_with_error(std::vector<float> means
   return(valid_combinations);
 }
 
-std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> variants, float lower_bound, float upper_bound, std::vector<float> means){ 
-  std::vector<std::vector<float>> pairs;
+std::vector<std::vector<double>> frequency_pair_finder(std::vector<variant> variants, double lower_bound, double upper_bound, std::vector<double> means){ 
+  std::vector<std::vector<double>> pairs;
   std::vector<uint32_t> track_positions;
 
   for(uint32_t i=0; i < variants.size(); i++){
@@ -139,7 +139,7 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
         size_t index = std::distance(track_positions.begin(), it);
         pairs[index].push_back(means[variants[i].cluster_assigned]);
       } else{
-        std::vector<float> tmp = {means[variants[i].cluster_assigned]};
+        std::vector<double> tmp = {means[variants[i].cluster_assigned]};
         pairs.push_back(tmp);
         track_positions.push_back(variants[i].position);
       }
@@ -149,17 +149,6 @@ std::vector<std::vector<float>> frequency_pair_finder(std::vector<variant> varia
   return(pairs);
 }
 
-bool cluster_gravity_analysis(std::vector<std::vector<float>> solutions){
-  //in the event of multiple solutions, check that the largest cluster is the same
-  std::vector<float> max_values;
-  for(auto solution : solutions){
-    float max = *std::max_element(solution.begin(), solution.end());
-    max_values.push_back(max);
-  }
-  bool all_same = std::all_of(max_values.begin() + 1, max_values.end(), [&](float x) { return x == max_values[0]; });  
-  return(all_same);
-}
-
 bool account_for_clusters(std::vector<float> means, std::vector<std::vector<float>> results, float error){
   bool keep = false;
   std::vector<float> accounted_means;
@@ -190,7 +179,7 @@ bool account_for_clusters(std::vector<float> means, std::vector<std::vector<floa
   return(keep);
 }
 
-void find_combinations(std::vector<float> means, uint32_t index, std::vector<float> &current, std::vector<std::vector<float>> &results){
+void find_combinations(std::vector<double> means, uint32_t index, std::vector<double> &current, std::vector<std::vector<double>> &results){
   if (!current.empty()){
     results.push_back(current);
   }
@@ -201,9 +190,9 @@ void find_combinations(std::vector<float> means, uint32_t index, std::vector<flo
   }
 }
 
-std::vector<std::vector<float>> find_solutions(std::vector<float> means, float error){
-  std::vector<float> current;
-  std::vector<std::vector<float>> results;
+std::vector<std::vector<double>> find_solutions(std::vector<double> means, double error){
+  std::vector<double> current;
+  std::vector<std::vector<double>> results;
   find_combinations(means, 0, current, results);
 
   std::sort(results.begin(), results.end());
@@ -212,7 +201,7 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
   auto max_iter = std::max_element(means.begin(), means.end());
   auto min_iter = std::min_element(means.begin(), means.end());
 
-  std::vector<std::vector<float>> final_results;
+  std::vector<std::vector<double>> final_results;
   //constrain that the solutions must add to 1
   for(uint32_t i=0; i < results.size(); i++){
     bool keep = within_error_range(results[i], 1, error);
@@ -223,11 +212,11 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
   return(final_results);  
 }
 
-std::vector<float> parse_string_to_vector(const std::string& str) {
-    std::vector<float> result;
+std::vector<double> parse_string_to_vector(const std::string& str) {
+    std::vector<double> result;
     std::stringstream ss(str);
     char ch; // Used to read and discard non-numeric characters, including the decimal point
-    float num;
+    double num;
 
     // Read characters one by one
     while (ss >> ch) {
@@ -243,15 +232,15 @@ std::vector<float> parse_string_to_vector(const std::string& str) {
     return result;
 }
 
-std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> solution, std::vector<float> means, std::vector<float> &unresolved){ 
+std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<double> solution, std::vector<double> means, std::vector<double> &unresolved){ 
   std::vector<std::vector<uint32_t>> cluster_indexes(means.size());
-  std::vector<float> current;
-  std::vector<std::vector<float>> results;
-  std::vector<float> totals;
+  std::vector<double> current;
+  std::vector<std::vector<double>> results;
+  std::vector<double> totals;
 
   find_combinations(solution, 0, current, results);
   for(uint32_t i=0; i < results.size(); i++){
-    float sum = std::accumulate(results[i].begin(), results[i].end(), 0.0f); 
+    double sum = std::accumulate(results[i].begin(), results[i].end(), 0.0); 
     totals.push_back(sum);
   }
   //given a solution and the means, map each cluster to the cluster it contains
@@ -260,9 +249,9 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
     //th mean is part of the solution
     if(it != solution.end()){
         cluster_indexes[i].push_back(i);
-        float target = means[i];
-        std::vector<float> distances(totals.size());
-        std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); }); 
+        double target = means[i];
+        std::vector<double> distances(totals.size());
+        std::transform(totals.begin(), totals.end(), distances.begin(), [target](double num) { return std::abs(target - num); }); 
         uint32_t count = 0;
 
         //this checks the distances from the mean to all other possible peaks
@@ -272,12 +261,12 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
         if(count > 1) unresolved.push_back(target);
 
     } else {
-      float target = means[i];
+      double target = means[i];
       //the problem with this is that it looks at the min but not if two overlapping peaks occur
-      auto it = std::min_element(totals.begin(), totals.end(), [target](float a, float b) {return std::abs(a - target) < std::abs(b - target);});
+      auto it = std::min_element(totals.begin(), totals.end(), [target](double a, double b) {return std::abs(a - target) < std::abs(b - target);});
 
-      std::vector<float> distances(totals.size());
-      std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); }); 
+      std::vector<double> distances(totals.size());
+      std::transform(totals.begin(), totals.end(), distances.begin(), [target](double num) { return std::abs(target - num); }); 
       uint32_t count = 0;
       for(uint32_t d=0; d < distances.size(); d++){
         if(distances[d] < 0.03) count += 1;
@@ -291,13 +280,6 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
       if(count > 1) unresolved.push_back(means[i]);
     }
   }
-  /*for(uint32_t i=0; i < cluster_indexes.size(); i++){
-    for(uint32_t j=0; j < cluster_indexes[i].size(); j++){
-      std::cerr << cluster_indexes[i][j] << " ";
-    }
-    std::cerr << "\n";
-  }
-  for(auto u : unresolved) std::cerr << u << std::endl;*/
   return(cluster_indexes);
 }
 
@@ -323,11 +305,11 @@ std::vector<std::vector<double>> deduplicate_solutions(std::vector<std::vector<d
   return(solutions);
 }
 
-std::vector<float> parse_clustering_results(std::string clustering_file){
+std::vector<double> parse_clustering_results(std::string clustering_file){
   std::ifstream infile(clustering_file + ".txt");
   std::string line;
   uint32_t count = 0;
-  std::vector<float> numbers;
+  std::vector<double> numbers;
   while (std::getline(infile, line)) {
     if(count == 0) {
       count += 1;
@@ -341,22 +323,25 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
   }  
   return(numbers);
 }
-void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){ 
-  float depth_cutoff = 10; 
+void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
+  /*
+    Call consensus sequence from clusters.
+  */ 
+  double depth_cutoff = 10; 
   double error = 0.10; 
   float solution_error = 0.05;
   double quality_threshold = 20; 
 
-  std::vector<float> error_rate = cluster_error(variants_file, quality_threshold);
-  float freq_lower_bound = error_rate[0];
-  float freq_upper_bound = error_rate[1];
+  double error_rate = cluster_error(variants_file, quality_threshold, depth_cutoff);
+  double freq_lower_bound = error_rate+0.001;
+  double freq_upper_bound = 1-error_rate-0.001;
 
   //read in the cluster values
-  std::vector<float> means = parse_clustering_results(clustering_file);
+  std::vector<double> means = parse_clustering_results(clustering_file);
   for(auto m : means){
     std::cerr << "consensus means " << m << std::endl;
   }
-  std::vector<std::vector<float>> clusters(means.size());
+  std::vector<std::vector<double>> clusters(means.size());
   for(auto var : variants){
     if(var.cluster_assigned != -1){
       clusters[var.cluster_assigned].push_back(var.freq);
@@ -370,28 +355,28 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
     }
   }
   //find position wise frequency pairs
-  std::vector<std::vector<float>> pairs = frequency_pair_finder(variants, freq_lower_bound, freq_upper_bound, means); 
-  std::vector<std::vector<float>> solutions = find_solutions(means, error);  
+  std::vector<std::vector<double>> pairs = frequency_pair_finder(variants, freq_lower_bound, freq_upper_bound, means); 
+  std::vector<std::vector<double>> solutions = find_solutions(means, error);  
 
   //find peaks that can't be a subset of other peaks
-  std::vector<float> non_subset_means;
+  std::vector<double> non_subset_means;
   for(uint32_t i=0; i < means.size(); i++){
-    std::vector<std::vector<float>> tmp = find_subsets_with_error(means, means[i], solution_error);    
+    std::vector<std::vector<double>> tmp = find_subsets_with_error(means, means[i], solution_error);    
     if(tmp.size() <= 1){
       non_subset_means.push_back(means[i]);
     }
   }
   //reduce solution space to things that contain the non subset peaks
-  std::vector<std::vector<float>> realistic_solutions;
+  std::vector<std::vector<double>> realistic_solutions;
   for(uint32_t i=0; i < solutions.size(); i++){  
-      std::vector<float> tmp = solutions[i];
-      bool found = std::all_of(non_subset_means.begin(), non_subset_means.end(), [&tmp](float value) {return std::find(tmp.begin(), tmp.end(), value) != tmp.end();});
+      std::vector<double> tmp = solutions[i];
+      bool found = std::all_of(non_subset_means.begin(), non_subset_means.end(), [&tmp](double value) {return std::find(tmp.begin(), tmp.end(), value) != tmp.end();});
      if(found){
         realistic_solutions.push_back(solutions[i]);
      }
   }
   //check each solution that every possible peak is accounted for
-  std::vector<std::vector<float>> solution_sets;
+  std::vector<std::vector<double>> solution_sets;
   for(uint32_t i=0; i < realistic_solutions.size(); i++){
     bool keep = account_peaks(realistic_solutions[i], means, 1, solution_error);
     if(keep){
@@ -407,7 +392,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
   }
   std::cerr << "\n" << std::endl;
 
-  std::vector<float> solution;
+  std::vector<double> solution;
   bool traditional_majority= false; //if we can't find a solution call a traditional majority consensus
   if(solution_sets.size() == 0){
     std::cerr << clustering_file << " no solution found" << std::endl;
@@ -427,7 +412,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
   for(auto x : solution){
     std::cerr << x << std::endl;
   }
-  std::vector<float> unresolved;
+  std::vector<double> unresolved;
   std::vector<std::vector<uint32_t>> cluster_groups = find_combination_peaks(solution, means, unresolved);
 
   std::vector<std::vector<uint32_t>> inverse_groups(means.size());
 
@@ -1,7 +1,6 @@
 #include "gmm.h"
 #ifndef call_consensus_clustering
 #define call_consensus_clustering
-
 void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename, double default_threshold);
-void find_combinations(std::vector<float> means, uint32_t index, std::vector<float> &current, std::vector<std::vector<float>> &results);
+void find_combinations(std::vector<double> means, uint32_t index, std::vector<double> &current, std::vector<std::vector<double>> &results);
 #endif