Skip to content

Commit d487617

Browse files
committed
adding in fixes for no bed file and no pair file
1 parent 0678057 commit d487617

File tree

8 files changed

+372
-176
lines changed

8 files changed

+372
-176
lines changed

src/call_consensus_clustering.cpp

Lines changed: 88 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,36 @@
99
#include <algorithm>
1010
#include <numeric>
1111

12+
void call_majority_consensus(std::vector<variant> variants, uint32_t max_position, std::string clustering_file, double default_threshold){
13+
//if we can't find a solution simply take the majority variant per position
14+
std::vector<std::string> nucs;
15+
std::vector<float> freqs;
16+
std::vector<std::string> tmp(max_position, "N");
17+
for(uint32_t i=1; i <= max_position; i++){
18+
freqs.clear();
19+
nucs.clear();
20+
for(uint32_t j=0; j < variants.size(); j++){
21+
if(variants[j].position == i){
22+
nucs.push_back(variants[j].nuc);
23+
freqs.push_back(variants[j].freq);
24+
}
25+
}
26+
if(freqs.size() == 0) continue;
27+
uint32_t index = std::distance(freqs.begin(), std::max_element(freqs.begin(), freqs.end()));
28+
if(freqs[index] >= (float)default_threshold){
29+
tmp[i-1] = nucs[index];
30+
}
31+
}
32+
std::string consensus_string = std::accumulate(tmp.begin(), tmp.end(), std::string(""));
33+
//write the consensus to file
34+
std::string consensus_filename = clustering_file + ".fa";
35+
std::ofstream file(consensus_filename);
36+
std::string name = ">"+clustering_file+"_"+std::to_string(default_threshold)+"_threshold";
37+
file << name << "\n";
38+
file << consensus_string << "\n";
39+
file.close();
40+
}
41+
1242
float find_nearest_distance(const std::vector<float> all_sums, float value) {
1343
float min_distance = std::numeric_limits<float>::max();
1444
for (auto num : all_sums) {
@@ -185,11 +215,12 @@ std::vector<float> parse_string_to_vector(const std::string& str) {
185215
return result;
186216
}
187217

188-
std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> solution, std::vector<float> means){
218+
std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> solution, std::vector<float> means, std::vector<float> &unresolved){
189219
std::vector<std::vector<uint32_t>> cluster_indexes(means.size());
190220
std::vector<float> current;
191221
std::vector<std::vector<float>> results;
192222
std::vector<float> totals;
223+
193224
find_combinations(solution, 0, current, results);
194225
for(uint32_t i=0; i < results.size(); i++){
195226
float sum = std::accumulate(results[i].begin(), results[i].end(), 0.0f);
@@ -200,23 +231,42 @@ std::vector<std::vector<uint32_t>> find_combination_peaks(std::vector<float> sol
200231
auto it = std::find(solution.begin(), solution.end(), means[i]);
201232
if(it != solution.end()){
202233
cluster_indexes[i].push_back(i);
234+
float target = means[i];
235+
std::vector<float> distances(totals.size());
236+
std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); });
237+
uint32_t count = 0;
238+
for(uint32_t d=0; d < distances.size(); d++){
239+
if(distances[d] < 0.03) count += 1;
240+
}
241+
if(count > 1) unresolved.push_back(target);
242+
203243
} else {
204244
float target = means[i];
245+
//the problem with this is that it looks at the min but not if two overlapping peaks occur
205246
auto it = std::min_element(totals.begin(), totals.end(), [target](float a, float b) {return std::abs(a - target) < std::abs(b - target);});
247+
248+
std::vector<float> distances(totals.size());
249+
std::transform(totals.begin(), totals.end(), distances.begin(), [target](float num) { return std::abs(target - num); });
250+
uint32_t count = 0;
251+
for(uint32_t d=0; d < distances.size(); d++){
252+
if(distances[d] < 0.03) count += 1;
253+
}
206254
uint32_t index = std::distance(totals.begin(), it);
207255
for(auto x : results[index]){
208256
auto it2 = std::find(std::begin(means), std::end(means), x);
209257
uint32_t index2 = std::distance(std::begin(means), it2);
210258
cluster_indexes[i].push_back(index2);
211259
}
260+
if(count > 1) unresolved.push_back(means[i]);
212261
}
213262
}
214263
/*for(uint32_t i=0; i < cluster_indexes.size(); i++){
215264
for(uint32_t j=0; j < cluster_indexes[i].size(); j++){
216265
std::cerr << cluster_indexes[i][j] << " ";
217266
}
218267
std::cerr << "\n";
219-
}*/
268+
}
269+
for(auto u : unresolved) std::cerr << u << std::endl;*/
220270
return(cluster_indexes);
221271
}
222272

@@ -260,10 +310,10 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
260310
}
261311
return(numbers);
262312
}
263-
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file){
313+
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
264314
float depth_cutoff = 10;
265-
double error = 0.05;
266-
float solution_error = 0.05;
315+
double error = 0.10;
316+
float solution_error = 0.10;
267317

268318
std::vector<float> error_rate = cluster_error(variants_file);
269319
float freq_lower_bound = error_rate[0];
@@ -274,7 +324,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
274324
for(auto m : means){
275325
std::cerr << "consensus means " << m << std::endl;
276326
}
277-
std::cerr << " A " << std::endl;
278327
std::vector<std::vector<float>> clusters(means.size());
279328
for(auto var : variants){
280329
if(var.cluster_assigned != -1){
@@ -300,10 +349,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
300349
non_subset_means.push_back(means[i]);
301350
}
302351
}
303-
//TEST LINES
304-
for(auto nsm : non_subset_means){
305-
std::cerr << "non subset means " << nsm << std::endl;
306-
}
307352
//reduce solution space to things that contain the non subset peaks
308353
std::vector<std::vector<float>> realistic_solutions;
309354
for(uint32_t i=0; i < solutions.size(); i++){
@@ -317,45 +362,41 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
317362
//check each solution that every possible peak is accounted for
318363
std::vector<std::vector<float>> solution_sets;
319364
for(uint32_t i=0; i < realistic_solutions.size(); i++){
320-
for(auto r : realistic_solutions[i]){
321-
std::cerr << r << " ";
322-
}
323-
std::cerr << "\n";
324365
bool keep = account_peaks(realistic_solutions[i], means, 1, solution_error);
325366
if(keep){
326367
solution_sets.push_back(realistic_solutions[i]);
327368
}
328369
}
329370
std::vector<float> solution;
371+
bool traditional_majority= false; //if we can't find a solution call a traditional majority consensus
330372
if(solution_sets.size() == 0){
331373
std::cerr << clustering_file << " no solution found" << std::endl;
332-
exit(1);
374+
traditional_majority = true;
333375
} else if(solution_sets.size() > 1){
334376
std::cerr << "too many solutions" << std::endl;
335-
exit(1);
377+
traditional_majority = true;
336378
} else{
337379
solution = solution_sets[0];
338380
}
381+
382+
if(traditional_majority){
383+
call_majority_consensus(variants, max_position, clustering_file, default_threshold);
384+
exit(0);
385+
}
386+
339387
for(auto x : solution){
340388
std::cerr << x << std::endl;
341389
}
342-
std::vector<std::vector<uint32_t>> cluster_groups = find_combination_peaks(solution, means);
390+
std::vector<float> unresolved;
391+
std::vector<std::vector<uint32_t>> cluster_groups = find_combination_peaks(solution, means, unresolved);
392+
for(auto u : unresolved) std::cerr << "unresolved " << u << std::endl;
343393
std::vector<std::vector<uint32_t>> inverse_groups(means.size());
344-
345394
for(uint32_t i=0; i < cluster_groups.size(); i++){
346395
for(uint32_t j=0; j < cluster_groups[i].size(); j++){
347396
//std::cerr << cluster_groups[i][j] << " i " << i << " j " << j << std::endl;
348397
inverse_groups[cluster_groups[i][j]].push_back(i);
349398
}
350399
}
351-
/*for(auto x : inverse_groups){
352-
for(auto y : x){
353-
std::cerr << y << " ";
354-
}
355-
std::cerr << "\n";
356-
}
357-
exit(0);*/
358-
359400
//TESTLINES MEAN CODE
360401
std::string solution_string = "[";
361402
for(uint32_t i=0; i < solution.size(); i++){
@@ -394,7 +435,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
394435
}
395436
//a list of cluster assignments that we assign to consensus
396437
std::vector<int> major_indexes;
397-
std::vector<int> minor_indexes;
398438
//index of the "100%" cluster
399439
for(uint32_t j=0; j < means.size(); j++){
400440
float tmp = means[j];
@@ -405,10 +445,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
405445
auto it = std::find(solution.begin(), solution.end(), tmp);
406446
if((diff < error && it == solution.end()) || tmp == largest){
407447
std::cerr << "major index " << means[j] << " " << j << std::endl;
408-
if(tmp != largest){
409-
std::cerr << "adding to minor" << std::endl;
410-
//minor_indexes.push_back((int)j);
411-
}
412448
major_indexes.push_back((int)j);
413449
}
414450
}
@@ -423,30 +459,44 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
423459
std::vector<std::string> tmp(max_position, "N");
424460
all_consensus_seqs.push_back(tmp);
425461
}
426-
462+
427463
//iterate all variants and determine
428464
for(uint32_t i = 0; i < variants.size(); i++){
429465
//TESTLINES
430466
if(variants[i].nuc.find('+') != std::string::npos) continue;
431467
//TESTLINES
432-
if(variants[i].position == 1055){
468+
if(variants[i].position == 0){
433469
print = true;
434470
std::cerr << "\ntop freq " << variants[i].freq << " " << variants[i].nuc << " cluster " << variants[i].cluster_assigned << " " << variants[i].gapped_freq << std::endl;
435471
std::cerr << "vague assignment " << variants[i].vague_assignment << " del pos " << variants[i].pos_del_flag << " depth flag " << variants[i].depth_flag << std::endl;
436-
for(auto c : variants[i].probabilities){
437-
std::cerr << c << " ";
438-
}
439-
std::cerr << "\n";
472+
std::cerr << variants[i].amplicon_masked << std::endl;
440473
} else{
441474
print = false;
442475
}
476+
//if the mean for this cluster is unresolved we skip it
477+
auto it = std::find(unresolved.begin(), unresolved.end(), means[variants[i].cluster_assigned]);
478+
if(it != unresolved.end()){
479+
if(print){
480+
std::cerr << "unresolved " << means[variants[i].cluster_assigned] << std::endl;
481+
}
482+
continue;
483+
}
443484
//if this position is experiencing fluctuation across amplicons, call ambiguity
444485
if(variants[i].amplicon_flux && variants[i].freq < freq_upper_bound){
445486
if(print){
446487
std::cerr << "position is experiencing fluctuation" << std::endl;
447488
}
448489
continue;
449490
}
491+
492+
//if this amplicon is experiencing fluctuation across amplicons, call ambiguity
493+
if(variants[i].amplicon_masked && variants[i].freq < freq_upper_bound){
494+
if(print){
495+
std::cerr << "amplicon is experiencing fluctuation" << std::endl;
496+
}
497+
continue;
498+
}
499+
450500
if(variants[i].depth_flag){
451501
if(print){
452502
std::cerr << "a " << variants[i].depth_flag << std::endl;
@@ -497,12 +547,11 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
497547
if(mit == solution.end()) continue;
498548
//assign the point to all applicable groups
499549
auto it = std::find(inverse_groups[j].begin(), inverse_groups[j].end(), variants[i].cluster_assigned);
500-
if(it != inverse_groups[j].end()){
550+
if(it != inverse_groups[j].end()){
501551
all_consensus_seqs[j][position-1] = variants[i].nuc;
502552
}
503553
}
504554
}
505-
506555
std::vector<std::string> all_sequences;
507556
for(uint32_t i=0; i < all_consensus_seqs.size(); i++){
508557
std::string tmp = std::accumulate(all_consensus_seqs[i].begin(), all_consensus_seqs[i].end(), std::string(""));

src/call_consensus_clustering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
#ifndef call_consensus_clustering
33
#define call_consensus_clustering
44

5-
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename);
5+
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename, double default_threshold);
66
void find_combinations(std::vector<float> means, uint32_t index, std::vector<float> &current, std::vector<std::vector<float>> &results);
77
#endif

src/estimate_error.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,18 @@ std::vector<float> cluster_error(std::string filename){
77
*/
88

99
float lower_bound = 0.01;
10-
float upper_bound = 0.05;
10+
float upper_bound = 0.10;
1111
uint32_t depth_cutoff = 10;
1212
float quality_threshold = 20;
1313
uint32_t round_val = 4;
1414

1515
std::vector<uint32_t> deletion_positions = find_deletion_positions(filename, depth_cutoff, lower_bound, upper_bound, round_val);
1616
std::vector<variant> base_variants;
1717
parse_internal_variants(filename, base_variants, depth_cutoff, lower_bound, upper_bound, deletion_positions, round_val);
18+
if(base_variants[0].version_1_var){
19+
calculate_reference_frequency(base_variants, filename, depth_cutoff, lower_bound, upper_bound, deletion_positions);
20+
}
21+
1822
gaussian_mixture_model retrained_original;
1923
std::vector<variant> variants_original;
2024
uint32_t useful_count_original = 0;
@@ -28,7 +32,6 @@ std::vector<float> cluster_error(std::string filename){
2832
variants_original.push_back(base_variants[i]);
2933
}
3034
}
31-
std::cerr << "C" << std::endl;
3235
arma::mat data_original(1, useful_count_original, arma::fill::zeros);
3336
//std::cerr << useful_count_original << std::endl;
3437
uint32_t count_original=0;
@@ -38,27 +41,35 @@ std::vector<float> cluster_error(std::string filename){
3841
data_original.col(count_original) = tmp;
3942
count_original += 1;
4043
}
41-
uint32_t n = 5;
42-
std::cerr << count_original << std::endl;
44+
uint32_t n = 3;
4345
retrained_original = retrain_model(n, data_original, variants_original, 2, 0.0001);
44-
std::cerr << "D" << std::endl;
4546
assign_clusters(variants_original, retrained_original, 2);
4647
std::vector<std::vector<double>> clusters(n);
4748
for(auto var : variants_original){
4849
int cluster = var.cluster_assigned;
4950
clusters[cluster].push_back(var.freq);
51+
//std::cerr << var.freq << " " << cluster << std::endl;
5052
}
5153
uint32_t j = 0;
5254
uint32_t largest=0;
55+
std::vector<float> cluster_percents;
5356
for(uint32_t i=0; i < clusters.size(); i++){
57+
std::cerr << clusters[i].size() << std::endl;
58+
cluster_percents.push_back((float)clusters[i].size() / (float)variants_original.size());
5459
if(clusters[i].size() > largest){
5560
j = i;
5661
largest = clusters[i].size();
5762
}
5863
}
59-
std::vector<float> means;
64+
for(auto me : retrained_original.means){
65+
std::cerr << me << std::endl;
66+
}
67+
for(auto per : cluster_percents){
68+
std::cerr << per << std::endl;
69+
}
6070
//get the upper edge of the noise cluster
6171
auto max_it = std::max_element(clusters[j].begin(), clusters[j].end());
72+
std::cerr << *max_it << std::endl;
6273
std::vector<float> max_error_pos(max_pos+1);
6374
for(uint32_t i=0; i < base_variants.size(); i++){
6475
if(base_variants[i].freq <= *max_it){
@@ -72,8 +83,13 @@ std::vector<float> cluster_error(std::string filename){
7283
max_error = x;
7384
}
7485
}
86+
//std::cerr << "max error " << max_error << std::endl;
7587
std::vector<float> tmp;
76-
tmp.push_back(*max_it);
88+
//tmp.push_back(*max_it);
89+
//tmp.push_back(1-*max_it);
90+
91+
tmp.push_back(max_error);
7792
tmp.push_back(1-max_error);
93+
//exit(0);
7894
return tmp;
7995
}

0 commit comments

Comments
 (0)