Skip to content

Commit 0429610

Browse files
committed
update
1 parent 61271cb commit 0429610

File tree

10 files changed

+423
-297
lines changed

10 files changed

+423
-297
lines changed

src/call_consensus_clustering.cpp

Lines changed: 74 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "call_consensus_clustering.h"
22
#include "estimate_error.h"
33
#include "gmm.h"
4+
#include "saga.h"
45
#include <ostream>
56
#include <iostream>
67
#include <vector>
@@ -9,6 +10,33 @@
910
#include <algorithm>
1011
#include <numeric>
1112

13+
bool test_cluster_deviation(float nearest_cluster, float variant_cluster, float std_dev){
14+
bool fluctuation = false;
15+
//CLEANUP THIS CAN BE CALCULATED ONCE PER ALL CLUSTERS
16+
//determine if the assigned and nearest cluster can be resolved based on variant fluctuation
17+
std::vector<double> tmp = {(double) nearest_cluster, (double) variant_cluster};
18+
double cluster_dev = calculate_standard_deviation(tmp);
19+
if((double)std_dev > cluster_dev){
20+
fluctuation = true;
21+
}
22+
return(fluctuation);
23+
}
24+
25+
double find_neighboring_cluster(double freq, uint32_t cluster_assigned, std::vector<double> means){
26+
//find closest cluster by mean value
27+
double min_dist = 1;
28+
uint32_t index = 0;
29+
for(uint32_t i=0; i < means.size(); i++){
30+
if(i == cluster_assigned) continue;
31+
double dist = std::abs(means[i]-freq);
32+
if(dist < min_dist){
33+
min_dist = dist;
34+
index = i;
35+
}
36+
}
37+
return(means[index]);
38+
}
39+
1240
void call_majority_consensus(std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
1341
//if we can't find a solution simply take the majority variant per position
1442
std::vector<std::string> nucs;
@@ -229,12 +257,15 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
229257
//given a solution and the means, map each cluster to the cluster it contains
230258
for(uint32_t i=0; i < means.size(); i++){
231259
auto it = std::find(solution.begin(), solution.end(), means[i]);
260+
//th mean is part of the solution
232261
if(it != solution.end()){
233262
cluster_indexes[i].push_back(i);
234263
float target = means[i];
235264
std::vector<float> distances(totals.size());
236265
std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); });
237266
uint32_t count = 0;
267+
268+
//this checks the distances from the mean to all other possible peaks
238269
for(uint32_t d=0; d < distances.size(); d++){
239270
if(distances[d] < 0.03) count += 1;
240271
}
@@ -313,9 +344,10 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
313344
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
314345
float depth_cutoff = 10;
315346
double error = 0.10;
316-
float solution_error = 0.10;
317-
318-
std::vector<float> error_rate = cluster_error(variants_file);
347+
float solution_error = 0.05;
348+
double quality_threshold = 20;
349+
350+
std::vector<float> error_rate = cluster_error(variants_file, quality_threshold);
319351
float freq_lower_bound = error_rate[0];
320352
float freq_upper_bound = error_rate[1];
321353

@@ -340,7 +372,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
340372
//find position wise frequency pairs
341373
std::vector<std::vector<float>> pairs = frequency_pair_finder(variants, freq_lower_bound, freq_upper_bound, means);
342374
std::vector<std::vector<float>> solutions = find_solutions(means, error);
343-
375+
344376
//find peaks that can't be a subset of other peaks
345377
std::vector<float> non_subset_means;
346378
for(uint32_t i=0; i < means.size(); i++){
@@ -358,7 +390,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
358390
realistic_solutions.push_back(solutions[i]);
359391
}
360392
}
361-
362393
//check each solution that every possible peak is accounted for
363394
std::vector<std::vector<float>> solution_sets;
364395
for(uint32_t i=0; i < realistic_solutions.size(); i++){
@@ -367,6 +398,15 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
367398
solution_sets.push_back(realistic_solutions[i]);
368399
}
369400
}
401+
402+
for(auto sol : solution_sets){
403+
std::cerr << "\nsolution" << std::endl;
404+
for(auto s : sol){
405+
std::cerr << s << " ";
406+
}
407+
}
408+
std::cerr << "\n" << std::endl;
409+
370410
std::vector<float> solution;
371411
bool traditional_majority= false; //if we can't find a solution call a traditional majority consensus
372412
if(solution_sets.size() == 0){
@@ -389,11 +429,10 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
389429
}
390430
std::vector<float> unresolved;
391431
std::vector<std::vector<uint32_t>> cluster_groups = find_combination_peaks(solution, means, unresolved);
392-
for(auto u : unresolved) std::cerr << "unresolved " << u << std::endl;
432+
393433
std::vector<std::vector<uint32_t>> inverse_groups(means.size());
394434
for(uint32_t i=0; i < cluster_groups.size(); i++){
395435
for(uint32_t j=0; j < cluster_groups[i].size(); j++){
396-
//std::cerr << cluster_groups[i][j] << " i " << i << " j " << j << std::endl;
397436
inverse_groups[cluster_groups[i][j]].push_back(i);
398437
}
399438
}
@@ -465,7 +504,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
465504
//TESTLINES
466505
if(variants[i].nuc.find('+') != std::string::npos) continue;
467506
//TESTLINES
468-
if(variants[i].position == 0){
507+
if(variants[i].position == 13572){
469508
print = true;
470509
std::cerr << "\ntop freq " << variants[i].freq << " " << variants[i].nuc << " cluster " << variants[i].cluster_assigned << " " << variants[i].gapped_freq << std::endl;
471510
std::cerr << "vague assignment " << variants[i].vague_assignment << " del pos " << variants[i].pos_del_flag << " depth flag " << variants[i].depth_flag << std::endl;
@@ -482,11 +521,33 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
482521
continue;
483522
}
484523
//if this position is experiencing fluctuation across amplicons, call ambiguity
485-
if(variants[i].amplicon_flux && variants[i].freq < freq_upper_bound){
524+
if(variants[i].amplicon_flux && variants[i].freq < freq_upper_bound && variants[i].freq > freq_lower_bound){
525+
526+
//find all clusters not part of the same assignment
527+
std::vector<double> other_population_clusters;
528+
for(uint32_t j=0; j < inverse_groups.size(); j++){
529+
//check to make sure you're lookin at a group that's part of the solution
530+
auto mit = std::find(solution.begin(), solution.end(), means[j]);
531+
if(mit == solution.end()) continue;
532+
auto it = std::find(inverse_groups[j].begin(), inverse_groups[j].end(), variants[i].cluster_assigned);
533+
//assigned cluster is not apart of the population
534+
if(it == inverse_groups[j].end())
535+
for(auto ig : inverse_groups[j]){
536+
//CLEAN UP this will push redundant things back
537+
other_population_clusters.push_back(means[ig]);
538+
}
539+
}
540+
541+
//find the second closest cluster index
542+
double closest_mean = find_neighboring_cluster((double)variants[i].gapped_freq, variants[i].cluster_assigned, other_population_clusters);
543+
//check if the cluster is within the standard dev of the variant
544+
bool fluctuating = test_cluster_deviation(closest_mean, means[variants[i].cluster_assigned], variants[i].std_dev);
486545
if(print){
487-
std::cerr << "position is experiencing fluctuation" << std::endl;
546+
std::cerr << "fluctuating " << fluctuating << std::endl;
547+
}
548+
if(fluctuating){
549+
continue;
488550
}
489-
continue;
490551
}
491552

492553
//if this amplicon is experiencing fluctuation across amplicons, call ambiguity
@@ -541,6 +602,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
541602
}
542603
continue;
543604
}
605+
544606
for(uint32_t j=0; j < inverse_groups.size(); j++){
545607
//check to make sure you're lookin at a group that's part of the solution
546608
auto mit = std::find(solution.begin(), solution.end(), means[j]);
@@ -566,6 +628,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
566628
auto it = std::find(solution.begin(), solution.end(), tmp_mean);
567629
if(it == solution.end()) continue;
568630
file << ">"+clustering_file+"_cluster_"+ std::to_string(means[i]) << "\n";
631+
std::cerr << all_sequences[i][13571] << std::endl;
569632
file << all_sequences[i] << "\n";
570633
}
571634
file.close();

src/estimate_error.cpp

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,122 @@
11
#include "estimate_error.h"
22
#include "gmm.h"
3+
#include "saga.h"
34

4-
std::vector<float> cluster_error(std::string filename){
5+
std::vector<double> z_score(std::vector<double> data) {
6+
double mean = std::accumulate(data.begin(), data.end(), 0.0) / data.size();
7+
double sq_sum = std::inner_product(data.begin(), data.end(), data.begin(), 0.0);
8+
double stddev = std::sqrt(sq_sum / data.size() - mean * mean);
9+
10+
std::vector<double> z_scores;
11+
for (double x : data)
12+
z_scores.push_back((x - mean) / stddev);
13+
return z_scores;
14+
}
15+
16+
std::vector<std::vector<uint32_t>> determine_outlier_points(std::vector<double> data, std::vector<double> means){
17+
std::vector<std::vector<double>> clusters(means.size());
18+
//assign points to clusters
19+
for(auto tmp : data){
20+
auto it = std::min_element(means.begin(), means.end(),
21+
[tmp](double a, double b) {
22+
return std::fabs(a - tmp) < std::fabs(b - tmp);
23+
});
24+
uint32_t index = std::distance(means.begin(), it);
25+
clusters[index].push_back(tmp);
26+
}
27+
28+
std::vector<std::vector<uint32_t>> removal_points(means.size());
29+
//calculate cluster specific percentiles
30+
for(uint32_t j=0; j < clusters.size(); j++){
31+
std::vector<double> z_scores = z_score(clusters[j]);
32+
for(uint32_t i=0; i < z_scores.size(); i++){
33+
double abs = std::abs(z_scores[i]);
34+
if(abs >= 10){
35+
removal_points[j].push_back(i);
36+
}
37+
}
38+
}
39+
return(removal_points);
40+
}
41+
42+
std::vector<float> cluster_error(std::string filename, double quality_threshold){
543
/*
644
Here we use clustering to determine the value of the noise.
745
*/
846

9-
float lower_bound = 0.01;
10-
float upper_bound = 0.10;
47+
float lower_bound = 0.50;
48+
float upper_bound = 0.99;
1149
uint32_t depth_cutoff = 10;
12-
float quality_threshold = 20;
1350
uint32_t round_val = 4;
1451

1552
std::vector<uint32_t> deletion_positions = find_deletion_positions(filename, depth_cutoff, lower_bound, upper_bound, round_val);
1653
std::vector<variant> base_variants;
17-
parse_internal_variants(filename, base_variants, depth_cutoff, lower_bound, upper_bound, deletion_positions, round_val);
54+
parse_internal_variants(filename, base_variants, depth_cutoff, lower_bound, upper_bound, deletion_positions, round_val, quality_threshold);
1855
if(base_variants[0].version_1_var){
1956
calculate_reference_frequency(base_variants, filename, depth_cutoff, lower_bound, upper_bound, deletion_positions);
2057
}
2158

22-
gaussian_mixture_model retrained_original;
2359
std::vector<variant> variants_original;
2460
uint32_t useful_count_original = 0;
2561
uint32_t max_pos = 0;
62+
std::vector<double> frequencies;
2663
for(uint32_t i=0; i < base_variants.size(); i++){
2764
if(base_variants[i].position > max_pos) max_pos = base_variants[i].position;
28-
//changed this for TEST
29-
if(!base_variants[i].amplicon_flux && !base_variants[i].depth_flag && !base_variants[i].outside_freq_range && !base_variants[i].qual_flag && !base_variants[i].del_flag && !base_variants[i].amplicon_masked && !base_variants[i].primer_masked){
30-
65+
if(!base_variants[i].amplicon_flux && !base_variants[i].depth_flag && !base_variants[i].outside_freq_range && !base_variants[i].qual_flag && !base_variants[i].del_flag && !base_variants[i].amplicon_masked && !base_variants[i].primer_masked){
3166
useful_count_original++;
3267
variants_original.push_back(base_variants[i]);
68+
frequencies.push_back(base_variants[i].freq);
3369
}
3470
}
71+
3572
arma::mat data_original(1, useful_count_original, arma::fill::zeros);
36-
//std::cerr << useful_count_original << std::endl;
3773
uint32_t count_original=0;
3874
for(uint32_t i = 0; i < variants_original.size(); i++){
39-
//check if variant should be filtered for first pass model
40-
double tmp = static_cast<double>(variants_original[i].freq); //transform
75+
double tmp = static_cast<double>(variants_original[i].gapped_freq);
4176
data_original.col(count_original) = tmp;
4277
count_original += 1;
4378
}
44-
uint32_t n = 3;
45-
retrained_original = retrain_model(n, data_original, variants_original, 2, 0.0001);
46-
assign_clusters(variants_original, retrained_original, 2);
79+
80+
uint32_t n = 2;
81+
gaussian_mixture_model model = retrain_model(n, data_original, variants_original, 2, 0.00001);
82+
std::vector<double> means = model.means;
83+
84+
//for each cluster this describes the points which are outliers
85+
std::vector<std::vector<uint32_t>> removal_points = determine_outlier_points(frequencies, means);
4786
std::vector<std::vector<double>> clusters(n);
87+
4888
for(auto var : variants_original){
49-
int cluster = var.cluster_assigned;
50-
clusters[cluster].push_back(var.freq);
51-
//std::cerr << var.freq << " " << cluster << std::endl;
89+
double tmp = (double)var.gapped_freq;
90+
auto it = std::min_element(means.begin(), means.end(),
91+
[tmp](double a, double b) {
92+
return std::fabs(a - tmp) < std::fabs(b - tmp);
93+
});
94+
uint32_t index = std::distance(means.begin(), it);
95+
clusters[index].push_back(var.gapped_freq);
5296
}
5397
uint32_t j = 0;
5498
uint32_t largest=0;
55-
std::vector<float> cluster_percents;
5699
for(uint32_t i=0; i < clusters.size(); i++){
57-
std::cerr << clusters[i].size() << std::endl;
58-
cluster_percents.push_back((float)clusters[i].size() / (float)variants_original.size());
59100
if(clusters[i].size() > largest){
60101
j = i;
61102
largest = clusters[i].size();
62103
}
63104
}
64-
for(auto me : retrained_original.means){
65-
std::cerr << me << std::endl;
66-
}
67-
for(auto per : cluster_percents){
68-
std::cerr << per << std::endl;
69-
}
70-
//get the upper edge of the noise cluster
71-
auto max_it = std::max_element(clusters[j].begin(), clusters[j].end());
72-
std::cerr << *max_it << std::endl;
73-
std::vector<float> max_error_pos(max_pos+1);
74-
for(uint32_t i=0; i < base_variants.size(); i++){
75-
if(base_variants[i].freq <= *max_it){
76-
max_error_pos[base_variants[i].position] += base_variants[i].freq;
105+
std::vector<double> universal_cluster = clusters[j];
106+
std::vector<uint32_t> outliers = removal_points[j];
107+
std::vector<double> cleaned_cluster;
108+
for(uint32_t i=0; i < universal_cluster.size(); i++){
109+
auto it = std::find(outliers.begin(), outliers.end(), i);
110+
if(it == outliers.end()){
111+
cleaned_cluster.push_back(universal_cluster[i]);
77112
}
78113
}
79114

80-
float max_error = 0;
81-
for(auto x : max_error_pos){
82-
if(x >= *max_it && x > max_error){
83-
max_error = x;
84-
}
85-
}
86-
//std::cerr << "max error " << max_error << std::endl;
115+
//get the upper edge of the noise cluster
116+
auto min_it = std::min_element(cleaned_cluster.begin(), cleaned_cluster.end());
117+
std::cerr << "min it " << *min_it << std::endl;
87118
std::vector<float> tmp;
88-
//tmp.push_back(*max_it);
89-
//tmp.push_back(1-*max_it);
90-
91-
tmp.push_back(max_error);
92-
tmp.push_back(1-max_error);
93-
//exit(0);
119+
tmp.push_back(1-(*min_it));
120+
tmp.push_back(*min_it);
94121
return tmp;
95122
}

src/estimate_error.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
#include "./include/armadillo"
44
#ifndef estimate_error
55
#define estimate_error
6-
std::vector<float> cluster_error(std::string filename);
6+
std::vector<float> cluster_error(std::string filename, double quality_threshold);
77
#endif

0 commit comments

Comments
 (0)