9
9
#include < algorithm>
10
10
#include < numeric>
11
11
12
+ void call_majority_consensus (std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
13
+ // if we can't find a solution simply take the majority variant per position
14
+ std::vector<std::string> nucs;
15
+ std::vector<float > freqs;
16
+ std::vector<std::string> tmp (max_position, " N" );
17
+ for (uint32_t i=1 ; i <= max_position; i++){
18
+ freqs.clear ();
19
+ nucs.clear ();
20
+ for (uint32_t j=0 ; j < variants.size (); j++){
21
+ if (variants[j].position == i){
22
+ nucs.push_back (variants[j].nuc );
23
+ freqs.push_back (variants[j].freq );
24
+ }
25
+ }
26
+ if (freqs.size () == 0 ) continue ;
27
+ uint32_t index = std::distance (freqs.begin (), std::max_element (freqs.begin (), freqs.end ()));
28
+ if (freqs[index] >= (float )default_threshold){
29
+ tmp[i-1 ] = nucs[index];
30
+ }
31
+ }
32
+ std::string consensus_string = std::accumulate (tmp.begin (), tmp.end (), std::string (" " ));
33
+ // write the consensus to file
34
+ std::string consensus_filename = clustering_file + " .fa" ;
35
+ std::ofstream file (consensus_filename);
36
+ std::string name = " >" +clustering_file+" _" +std::to_string (default_threshold)+" _threshold" ;
37
+ file << name << " \n " ;
38
+ file << consensus_string << " \n " ;
39
+ file.close ();
40
+ }
41
+
12
42
float find_nearest_distance (const std::vector<float > all_sums, float value) {
13
43
float min_distance = std::numeric_limits<float >::max ();
14
44
for (auto num : all_sums) {
@@ -185,11 +215,12 @@ std::vector<float> parse_string_to_vector(const std::string& str) {
185
215
return result;
186
216
}
187
217
188
- std::vector<std::vector<uint32_t >> find_combination_peaks (std::vector<float > solution, std::vector<float > means){
218
+ std::vector<std::vector<uint32_t >> find_combination_peaks (std::vector<float > solution, std::vector<float > means, std::vector< float > &unresolved ){
189
219
std::vector<std::vector<uint32_t >> cluster_indexes (means.size ());
190
220
std::vector<float > current;
191
221
std::vector<std::vector<float >> results;
192
222
std::vector<float > totals;
223
+
193
224
find_combinations (solution, 0 , current, results);
194
225
for (uint32_t i=0 ; i < results.size (); i++){
195
226
float sum = std::accumulate (results[i].begin (), results[i].end (), 0 .0f );
@@ -200,23 +231,42 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
200
231
auto it = std::find (solution.begin (), solution.end (), means[i]);
201
232
if (it != solution.end ()){
202
233
cluster_indexes[i].push_back (i);
234
+ float target = means[i];
235
+ std::vector<float > distances (totals.size ());
236
+ std::transform (totals.begin (), totals.end (), distances.begin (), [target](float num) { return std::abs (target - num); });
237
+ uint32_t count = 0 ;
238
+ for (uint32_t d=0 ; d < distances.size (); d++){
239
+ if (distances[d] < 0.03 ) count += 1 ;
240
+ }
241
+ if (count > 1 ) unresolved.push_back (target);
242
+
203
243
} else {
204
244
float target = means[i];
245
+ // the problem with this is that it looks at the min but not if two overlapping peaks occur
205
246
auto it = std::min_element (totals.begin (), totals.end (), [target](float a, float b) {return std::abs (a - target) < std::abs (b - target);});
247
+
248
+ std::vector<float > distances (totals.size ());
249
+ std::transform (totals.begin (), totals.end (), distances.begin (), [target](float num) { return std::abs (target - num); });
250
+ uint32_t count = 0 ;
251
+ for (uint32_t d=0 ; d < distances.size (); d++){
252
+ if (distances[d] < 0.03 ) count += 1 ;
253
+ }
206
254
uint32_t index = std::distance (totals.begin (), it);
207
255
for (auto x : results[index]){
208
256
auto it2 = std::find (std::begin (means), std::end (means), x);
209
257
uint32_t index2 = std::distance (std::begin (means), it2);
210
258
cluster_indexes[i].push_back (index2);
211
259
}
260
+ if (count > 1 ) unresolved.push_back (means[i]);
212
261
}
213
262
}
214
263
/* for(uint32_t i=0; i < cluster_indexes.size(); i++){
215
264
for(uint32_t j=0; j < cluster_indexes[i].size(); j++){
216
265
std::cerr << cluster_indexes[i][j] << " ";
217
266
}
218
267
std::cerr << "\n";
219
- }*/
268
+ }
269
+ for(auto u : unresolved) std::cerr << u << std::endl;*/
220
270
return (cluster_indexes);
221
271
}
222
272
@@ -260,10 +310,10 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
260
310
}
261
311
return (numbers);
262
312
}
263
- void cluster_consensus (std::vector<variant> variants, std::string clustering_file, std::string variants_file){
313
+ void cluster_consensus (std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold ){
264
314
float depth_cutoff = 10 ;
265
- double error = 0.05 ;
266
- float solution_error = 0.05 ;
315
+ double error = 0.10 ;
316
+ float solution_error = 0.10 ;
267
317
268
318
std::vector<float > error_rate = cluster_error (variants_file);
269
319
float freq_lower_bound = error_rate[0 ];
@@ -274,7 +324,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
274
324
for (auto m : means){
275
325
std::cerr << " consensus means " << m << std::endl;
276
326
}
277
- std::cerr << " A " << std::endl;
278
327
std::vector<std::vector<float >> clusters (means.size ());
279
328
for (auto var : variants){
280
329
if (var.cluster_assigned != -1 ){
@@ -300,10 +349,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
300
349
non_subset_means.push_back (means[i]);
301
350
}
302
351
}
303
- // TEST LINES
304
- for (auto nsm : non_subset_means){
305
- std::cerr << " non subset means " << nsm << std::endl;
306
- }
307
352
// reduce solution space to things that contain the non subset peaks
308
353
std::vector<std::vector<float >> realistic_solutions;
309
354
for (uint32_t i=0 ; i < solutions.size (); i++){
@@ -317,45 +362,41 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
317
362
// check each solution that every possible peak is accounted for
318
363
std::vector<std::vector<float >> solution_sets;
319
364
for (uint32_t i=0 ; i < realistic_solutions.size (); i++){
320
- for (auto r : realistic_solutions[i]){
321
- std::cerr << r << " " ;
322
- }
323
- std::cerr << " \n " ;
324
365
bool keep = account_peaks (realistic_solutions[i], means, 1 , solution_error);
325
366
if (keep){
326
367
solution_sets.push_back (realistic_solutions[i]);
327
368
}
328
369
}
329
370
std::vector<float > solution;
371
+ bool traditional_majority= false ; // if we can't find a solution call a traditional majority consensus
330
372
if (solution_sets.size () == 0 ){
331
373
std::cerr << clustering_file << " no solution found" << std::endl;
332
- exit ( 1 ) ;
374
+ traditional_majority = true ;
333
375
} else if (solution_sets.size () > 1 ){
334
376
std::cerr << " too many solutions" << std::endl;
335
- exit ( 1 ) ;
377
+ traditional_majority = true ;
336
378
} else {
337
379
solution = solution_sets[0 ];
338
380
}
381
+
382
+ if (traditional_majority){
383
+ call_majority_consensus (variants, max_position, clustering_file, default_threshold);
384
+ exit (0 );
385
+ }
386
+
339
387
for (auto x : solution){
340
388
std::cerr << x << std::endl;
341
389
}
342
- std::vector<std::vector<uint32_t >> cluster_groups = find_combination_peaks (solution, means);
390
+ std::vector<float > unresolved;
391
+ std::vector<std::vector<uint32_t >> cluster_groups = find_combination_peaks (solution, means, unresolved);
392
+ for (auto u : unresolved) std::cerr << " unresolved " << u << std::endl;
343
393
std::vector<std::vector<uint32_t >> inverse_groups (means.size ());
344
-
345
394
for (uint32_t i=0 ; i < cluster_groups.size (); i++){
346
395
for (uint32_t j=0 ; j < cluster_groups[i].size (); j++){
347
396
// std::cerr << cluster_groups[i][j] << " i " << i << " j " << j << std::endl;
348
397
inverse_groups[cluster_groups[i][j]].push_back (i);
349
398
}
350
399
}
351
- /* for(auto x : inverse_groups){
352
- for(auto y : x){
353
- std::cerr << y << " ";
354
- }
355
- std::cerr << "\n";
356
- }
357
- exit(0);*/
358
-
359
400
// TESTLINES MEAN CODE
360
401
std::string solution_string = " [" ;
361
402
for (uint32_t i=0 ; i < solution.size (); i++){
@@ -394,7 +435,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
394
435
}
395
436
// a list of cluster assignments that we assign to consensus
396
437
std::vector<int > major_indexes;
397
- std::vector<int > minor_indexes;
398
438
// index of the "100%" cluster
399
439
for (uint32_t j=0 ; j < means.size (); j++){
400
440
float tmp = means[j];
@@ -405,10 +445,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
405
445
auto it = std::find (solution.begin (), solution.end (), tmp);
406
446
if ((diff < error && it == solution.end ()) || tmp == largest){
407
447
std::cerr << " major index " << means[j] << " " << j << std::endl;
408
- if (tmp != largest){
409
- std::cerr << " adding to minor" << std::endl;
410
- // minor_indexes.push_back((int)j);
411
- }
412
448
major_indexes.push_back ((int )j);
413
449
}
414
450
}
@@ -423,30 +459,44 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
423
459
std::vector<std::string> tmp (max_position, " N" );
424
460
all_consensus_seqs.push_back (tmp);
425
461
}
426
-
462
+
427
463
// iterate all variants and determine
428
464
for (uint32_t i = 0 ; i < variants.size (); i++){
429
465
// TESTLINES
430
466
if (variants[i].nuc .find (' +' ) != std::string::npos) continue ;
431
467
// TESTLINES
432
- if (variants[i].position == 1055 ){
468
+ if (variants[i].position == 0 ){
433
469
print = true ;
434
470
std::cerr << " \n top freq " << variants[i].freq << " " << variants[i].nuc << " cluster " << variants[i].cluster_assigned << " " << variants[i].gapped_freq << std::endl;
435
471
std::cerr << " vague assignment " << variants[i].vague_assignment << " del pos " << variants[i].pos_del_flag << " depth flag " << variants[i].depth_flag << std::endl;
436
- for (auto c : variants[i].probabilities ){
437
- std::cerr << c << " " ;
438
- }
439
- std::cerr << " \n " ;
472
+ std::cerr << variants[i].amplicon_masked << std::endl;
440
473
} else {
441
474
print = false ;
442
475
}
476
+ // if the mean for this cluster is unresolved we skip it
477
+ auto it = std::find (unresolved.begin (), unresolved.end (), means[variants[i].cluster_assigned ]);
478
+ if (it != unresolved.end ()){
479
+ if (print){
480
+ std::cerr << " unresolved " << means[variants[i].cluster_assigned ] << std::endl;
481
+ }
482
+ continue ;
483
+ }
443
484
// if this position is experiencing fluctuation across amplicons, call ambiguity
444
485
if (variants[i].amplicon_flux && variants[i].freq < freq_upper_bound){
445
486
if (print){
446
487
std::cerr << " position is experiencing fluctuation" << std::endl;
447
488
}
448
489
continue ;
449
490
}
491
+
492
+ // if this amplicon is experiencing fluctuation across amplicons, call ambiguity
493
+ if (variants[i].amplicon_masked && variants[i].freq < freq_upper_bound){
494
+ if (print){
495
+ std::cerr << " amplicon is experiencing fluctuation" << std::endl;
496
+ }
497
+ continue ;
498
+ }
499
+
450
500
if (variants[i].depth_flag ){
451
501
if (print){
452
502
std::cerr << " a " << variants[i].depth_flag << std::endl;
@@ -497,12 +547,11 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
497
547
if (mit == solution.end ()) continue ;
498
548
// assign the point to all applicable groups
499
549
auto it = std::find (inverse_groups[j].begin (), inverse_groups[j].end (), variants[i].cluster_assigned );
500
- if (it != inverse_groups[j].end ()){
550
+ if (it != inverse_groups[j].end ()){
501
551
all_consensus_seqs[j][position-1 ] = variants[i].nuc ;
502
552
}
503
553
}
504
554
}
505
-
506
555
std::vector<std::string> all_sequences;
507
556
for (uint32_t i=0 ; i < all_consensus_seqs.size (); i++){
508
557
std::string tmp = std::accumulate (all_consensus_seqs[i].begin (), all_consensus_seqs[i].end (), std::string (" " ));
0 commit comments