1
1
#include "call_consensus_clustering.h"
#include "estimate_error.h"
#include "gmm.h"
#include "saga.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
#include <ostream>
#include <vector>
11
12
13
+ bool test_cluster_deviation (float nearest_cluster, float variant_cluster, float std_dev){
14
+ bool fluctuation = false ;
15
+ // CLEANUP THIS CAN BE CALCULATED ONCE PER ALL CLUSTERS
16
+ // determine if the assigned and nearest cluster can be resolved based on variant fluctuation
17
+ std::vector<double > tmp = {(double ) nearest_cluster, (double ) variant_cluster};
18
+ double cluster_dev = calculate_standard_deviation (tmp);
19
+ if ((double )std_dev > cluster_dev){
20
+ fluctuation = true ;
21
+ }
22
+ return (fluctuation);
23
+ }
24
+
25
// Returns the entry of `means` that lies closest to `freq`, ignoring the entry
// at position `cluster_assigned`.
//
// freq             : frequency to compare against each mean
// cluster_assigned : index INTO `means` that is excluded from the search; a
//                    caller passing a filtered list must make sure this index
//                    refers to that list and not to some other vector
// means            : candidate cluster means
//
// Returns a quiet NaN when `means` is empty, because there is no neighbour to
// report (previously this read means[0] out of bounds). Only candidates at a
// distance strictly below 1 can be selected; when none qualifies the function
// falls back to means[0], even if index 0 is the excluded one.
double find_neighboring_cluster(double freq, uint32_t cluster_assigned, std::vector<double> means){
  // nothing to compare against: signal "no neighbour" instead of indexing an empty vector
  if(means.empty()){
    return std::numeric_limits<double>::quiet_NaN();
  }

  // find closest cluster by mean value
  double min_dist = 1;
  uint32_t index = 0;
  for(uint32_t i = 0; i < means.size(); i++){
    if(i == cluster_assigned) continue;
    const double dist = std::abs(means[i] - freq);
    if(dist < min_dist){
      min_dist = dist;
      index = i;
    }
  }
  return means[index];
}
39
+
12
40
void call_majority_consensus (std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
13
41
// if we can't find a solution simply take the majority variant per position
14
42
std::vector<std::string> nucs;
@@ -229,12 +257,15 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
229
257
// given a solution and the means, map each cluster to the cluster it contains
230
258
for (uint32_t i=0 ; i < means.size (); i++){
231
259
auto it = std::find (solution.begin (), solution.end (), means[i]);
260
+ // the mean is part of the solution
232
261
if (it != solution.end ()){
233
262
cluster_indexes[i].push_back (i);
234
263
float target = means[i];
235
264
std::vector<float > distances (totals.size ());
236
265
std::transform (totals.begin (), totals.end (), distances.begin (), [target](float num) { return std::abs (target - num); });
237
266
uint32_t count = 0 ;
267
+
268
+ // this checks the distances from the mean to all other possible peaks
238
269
for (uint32_t d=0 ; d < distances.size (); d++){
239
270
if (distances[d] < 0.03 ) count += 1 ;
240
271
}
@@ -313,9 +344,10 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
313
344
void cluster_consensus (std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
314
345
float depth_cutoff = 10 ;
315
346
double error = 0.10 ;
316
- float solution_error = 0.10 ;
317
-
318
- std::vector<float > error_rate = cluster_error (variants_file);
347
+ float solution_error = 0.05 ;
348
+ double quality_threshold = 20 ;
349
+
350
+ std::vector<float > error_rate = cluster_error (variants_file, quality_threshold);
319
351
float freq_lower_bound = error_rate[0 ];
320
352
float freq_upper_bound = error_rate[1 ];
321
353
@@ -340,7 +372,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
340
372
// find position wise frequency pairs
341
373
std::vector<std::vector<float >> pairs = frequency_pair_finder (variants, freq_lower_bound, freq_upper_bound, means);
342
374
std::vector<std::vector<float >> solutions = find_solutions (means, error);
343
-
375
+
344
376
// find peaks that can't be a subset of other peaks
345
377
std::vector<float > non_subset_means;
346
378
for (uint32_t i=0 ; i < means.size (); i++){
@@ -358,7 +390,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
358
390
realistic_solutions.push_back (solutions[i]);
359
391
}
360
392
}
361
-
362
393
// check each solution that every possible peak is accounted for
363
394
std::vector<std::vector<float >> solution_sets;
364
395
for (uint32_t i=0 ; i < realistic_solutions.size (); i++){
@@ -367,6 +398,15 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
367
398
solution_sets.push_back (realistic_solutions[i]);
368
399
}
369
400
}
401
+
402
+ for (auto sol : solution_sets){
403
+ std::cerr << " \n solution" << std::endl;
404
+ for (auto s : sol){
405
+ std::cerr << s << " " ;
406
+ }
407
+ }
408
+ std::cerr << " \n " << std::endl;
409
+
370
410
std::vector<float > solution;
371
411
bool traditional_majority= false ; // if we can't find a solution call a traditional majority consensus
372
412
if (solution_sets.size () == 0 ){
@@ -389,11 +429,10 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
389
429
}
390
430
std::vector<float > unresolved;
391
431
std::vector<std::vector<uint32_t >> cluster_groups = find_combination_peaks (solution, means, unresolved);
392
- for ( auto u : unresolved) std::cerr << " unresolved " << u << std::endl;
432
+
393
433
std::vector<std::vector<uint32_t >> inverse_groups (means.size ());
394
434
for (uint32_t i=0 ; i < cluster_groups.size (); i++){
395
435
for (uint32_t j=0 ; j < cluster_groups[i].size (); j++){
396
- // std::cerr << cluster_groups[i][j] << " i " << i << " j " << j << std::endl;
397
436
inverse_groups[cluster_groups[i][j]].push_back (i);
398
437
}
399
438
}
@@ -465,7 +504,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
465
504
// TESTLINES
466
505
if (variants[i].nuc .find (' +' ) != std::string::npos) continue ;
467
506
// TESTLINES
468
- if (variants[i].position == 0 ){
507
+ if (variants[i].position == 13572 ){
469
508
print = true ;
470
509
std::cerr << " \n top freq " << variants[i].freq << " " << variants[i].nuc << " cluster " << variants[i].cluster_assigned << " " << variants[i].gapped_freq << std::endl;
471
510
std::cerr << " vague assignment " << variants[i].vague_assignment << " del pos " << variants[i].pos_del_flag << " depth flag " << variants[i].depth_flag << std::endl;
@@ -482,11 +521,33 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
482
521
continue ;
483
522
}
484
523
// if this position is experiencing fluctuation across amplicons, call ambiguity
485
- if (variants[i].amplicon_flux && variants[i].freq < freq_upper_bound){
524
+ if (variants[i].amplicon_flux && variants[i].freq < freq_upper_bound && variants[i].freq > freq_lower_bound){
525
+
526
+ // find all clusters not part of the same assignment
527
+ std::vector<double > other_population_clusters;
528
+ for (uint32_t j=0 ; j < inverse_groups.size (); j++){
529
+ // check to make sure you're looking at a group that's part of the solution
530
+ auto mit = std::find (solution.begin (), solution.end (), means[j]);
531
+ if (mit == solution.end ()) continue ;
532
+ auto it = std::find (inverse_groups[j].begin (), inverse_groups[j].end (), variants[i].cluster_assigned );
533
+ // assigned cluster is not a part of the population
534
+ if (it == inverse_groups[j].end ())
535
+ for (auto ig : inverse_groups[j]){
536
+ // CLEAN UP this will push redundant things back
537
+ other_population_clusters.push_back (means[ig]);
538
+ }
539
+ }
540
+
541
+ // find the second closest cluster index
542
+ double closest_mean = find_neighboring_cluster ((double )variants[i].gapped_freq , variants[i].cluster_assigned , other_population_clusters);
543
+ // check if the cluster is within the standard dev of the variant
544
+ bool fluctuating = test_cluster_deviation (closest_mean, means[variants[i].cluster_assigned ], variants[i].std_dev );
486
545
if (print){
487
- std::cerr << " position is experiencing fluctuation" << std::endl;
546
+ std::cerr << " fluctuating " << fluctuating << std::endl;
547
+ }
548
+ if (fluctuating){
549
+ continue ;
488
550
}
489
- continue ;
490
551
}
491
552
492
553
// if this amplicon is experiencing fluctuation across amplicons, call ambiguity
@@ -541,6 +602,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
541
602
}
542
603
continue ;
543
604
}
605
+
544
606
for (uint32_t j=0 ; j < inverse_groups.size (); j++){
545
607
// check to make sure you're looking at a group that's part of the solution
546
608
auto mit = std::find (solution.begin (), solution.end (), means[j]);
@@ -566,6 +628,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
566
628
auto it = std::find (solution.begin (), solution.end (), tmp_mean);
567
629
if (it == solution.end ()) continue ;
568
630
file << " >" +clustering_file+" _cluster_" + std::to_string (means[i]) << " \n " ;
631
+ std::cerr << all_sequences[i][13571 ] << std::endl;
569
632
file << all_sequences[i] << " \n " ;
570
633
}
571
634
file.close ();
0 commit comments