Skip to content

Commit 73244fc

Browse files
committed
reducing excess code, changing noise calculation
1 parent 811b5c2 commit 73244fc

9 files changed

+55
-68
lines changed

src/call_consensus_clustering.cpp

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,6 @@ std::vector<std::vector<float>> find_solutions(std::vector<float> means, float e
209209
std::sort(results.begin(), results.end());
210210
results.erase(std::unique(results.begin(), results.end()), results.end());
211211

212-
auto max_iter = std::max_element(means.begin(), means.end());
213-
auto min_iter = std::min_element(means.begin(), means.end());
214-
215212
std::vector<std::vector<float>> final_results;
216213
//constrain that the solutions must add to 1
217214
for(uint32_t i=0; i < results.size(); i++){
@@ -341,13 +338,11 @@ std::vector<float> parse_clustering_results(std::string clustering_file){
341338
}
342339
return(numbers);
343340
}
344-
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold){
345-
float depth_cutoff = 10;
341+
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_file, double default_threshold, uint32_t min_depth, uint8_t min_qual){
346342
double error = 0.10;
347-
float solution_error = 0.05;
348-
double quality_threshold = 20;
343+
float solution_error = 0.10;
349344

350-
double error_rate = cluster_error(variants_file, quality_threshold, depth_cutoff);
345+
double error_rate = cluster_error(variants_file, min_qual, min_depth);
351346
float freq_lower_bound = 1-error_rate;
352347
float freq_upper_bound = error_rate;
353348

@@ -504,7 +499,7 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
504499
//TESTLINES
505500
if(variants[i].nuc.find('+') != std::string::npos) continue;
506501
//TESTLINES
507-
if(variants[i].position == 13572){
502+
if(variants[i].position == 22029){
508503
print = true;
509504
std::cerr << "\ntop freq " << variants[i].freq << " " << variants[i].nuc << " cluster " << variants[i].cluster_assigned << " " << variants[i].gapped_freq << std::endl;
510505
std::cerr << "vague assignment " << variants[i].vague_assignment << " del pos " << variants[i].pos_del_flag << " depth flag " << variants[i].depth_flag << std::endl;
@@ -628,7 +623,6 @@ void cluster_consensus(std::vector<variant> variants, std::string clustering_fil
628623
auto it = std::find(solution.begin(), solution.end(), tmp_mean);
629624
if(it == solution.end()) continue;
630625
file << ">"+clustering_file+"_cluster_"+ std::to_string(means[i]) << "\n";
631-
std::cerr << all_sequences[i][13571] << std::endl;
632626
file << all_sequences[i] << "\n";
633627
}
634628
file.close();

src/call_consensus_clustering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
#ifndef call_consensus_clustering
33
#define call_consensus_clustering
44

5-
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename, double default_threshold);
5+
void cluster_consensus(std::vector<variant> variants, std::string clustering_file, std::string variants_filename, double default_threshold, uint32_t min_depth, uint8_t min_qual);
66
void find_combinations(std::vector<float> means, uint32_t index, std::vector<float> &current, std::vector<std::vector<float>> &results);
77
#endif

src/estimate_error.cpp

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22
#include "gmm.h"
33
#include "saga.h"
44

5+
std::vector<std::vector<double>> assign_nearest_cluster(){
6+
/*
7+
Given a set of means and variants assign each variant to the nearest cluster.
8+
*/
9+
10+
}
11+
512
std::vector<double> z_score(std::vector<double> data) {
613
double mean = std::accumulate(data.begin(), data.end(), 0.0) / data.size();
714
double sq_sum = std::inner_product(data.begin(), data.end(), data.begin(), 0.0);
@@ -39,7 +46,7 @@ std::vector<std::vector<uint32_t>> determine_outlier_points(std::vector<double>
3946
return(removal_points);
4047
}
4148

42-
double cluster_error(std::string filename, double quality_threshold, uint32_t depth_cutoff){
49+
double cluster_error(std::string filename, uint8_t quality_threshold, uint32_t depth_cutoff){
4350
/*
4451
Here we use clustering to determine the value of the noise.
4552
*/
@@ -76,32 +83,42 @@ double cluster_error(std::string filename, double quality_threshold, uint32_t de
7683
count_original += 1;
7784
}
7885

79-
uint32_t n = 3;
80-
gaussian_mixture_model model = retrain_model(n, data_original, variants_original, 2, 0.00001);
81-
std::vector<double> means = model.means;
82-
83-
//for each cluster this describes the points which are outliers
84-
std::vector<std::vector<uint32_t>> removal_points = determine_outlier_points(frequencies, means);
85-
std::vector<std::vector<double>> clusters(n);
86+
//start with a small n value and if we don't find two major clusters we increase the number of clusters
87+
uint32_t n = 2;
88+
kmeans_model model;
8689

87-
for(auto var : variants_original){
88-
double tmp = (double)var.gapped_freq;
89-
auto it = std::min_element(means.begin(), means.end(),
90-
[tmp](double a, double b) {
91-
return std::fabs(a - tmp) < std::fabs(b - tmp);
92-
});
93-
uint32_t index = std::distance(means.begin(), it);
94-
clusters[index].push_back(var.gapped_freq);
90+
while(n <= 5){
91+
model = train_model(n, data_original);
92+
for(auto m : model.means){
93+
std::cerr << m << std::endl;
94+
}
95+
std::vector<double> means = model.means;
96+
bool stop=true;
97+
for(uint32_t i=0; i < model.clusters.size(); i++){
98+
float percent = (float)model.clusters[i].size() / (float)data_original.size();
99+
if(percent > 0.85) {
100+
n++;
101+
stop = false;
102+
}
103+
std::cerr << "n " << n << " percent " << percent << std::endl;
104+
}
105+
if(stop) break;
95106
}
107+
//for each cluster this describes the points which are outliers
108+
std::vector<std::vector<uint32_t>> removal_points = determine_outlier_points(frequencies, model.means);
109+
96110
uint32_t j = 0;
97111
uint32_t largest=0;
98-
for(uint32_t i=0; i < clusters.size(); i++){
99-
if(clusters[i].size() > largest){
112+
for(uint32_t i=0; i < model.clusters.size(); i++){
113+
//std::cerr << "percent " << (float)model.clusters[i].size() / (float)data_original.size() << std::endl;
114+
double mean = std::accumulate(model.clusters[i].begin(), model.clusters[i].end(), 0.0) / model.clusters[i].size();
115+
//std::cerr << "mean " << mean << std::endl;
116+
if(model.clusters[i].size() > largest){
100117
j = i;
101-
largest = clusters[i].size();
118+
largest = model.clusters[i].size();
102119
}
103120
}
104-
std::vector<double> universal_cluster = clusters[j];
121+
std::vector<double> universal_cluster = model.clusters[j];
105122
std::vector<uint32_t> outliers = removal_points[j];
106123
std::vector<double> cleaned_cluster;
107124
for(uint32_t i=0; i < universal_cluster.size(); i++){
@@ -113,5 +130,7 @@ double cluster_error(std::string filename, double quality_threshold, uint32_t de
113130

114131
//get the upper edge of the noise cluster
115132
auto min_it = std::min_element(cleaned_cluster.begin(), cleaned_cluster.end());
116-
return *min_it;
133+
std::cerr << *min_it << std::endl;
134+
//exit(0);
135+
return(*min_it);
117136
}

src/estimate_error.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
#include "./include/armadillo"
44
#ifndef estimate_error
55
#define estimate_error
6-
double cluster_error(std::string filename, double quality_threshold, uint32_t depth_cutoff);
6+
double cluster_error(std::string filename, uint8_t quality_threshold, uint32_t depth_cutoff);
77
#endif

src/interval_tree.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,6 @@ void IntervalTree::detect_primer_issues(ITNode *root, uint32_t find_position){
138138

139139
void IntervalTree::detect_abberations(ITNode *root, uint32_t find_position){
140140
if (root==NULL) return;
141-
//unsure why this line is wrogn, but it is
142-
//if (find_position < (uint32_t)root->data->low) return;
143141
for(uint32_t i=0; i < root->amp_positions.size(); i++){
144142
if(find_position == root->amp_positions[i].pos){
145143
test_flux.push_back(root->amp_positions[i]);

src/ivar.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ int main(int argc, char *argv[]) {
331331
//ivar contam
332332
if (cmd.compare("contam") == 0) {
333333
g_args.min_threshold = 0;
334+
g_args.min_depth = 10;
335+
g_args.min_qual = 20;
334336
opt = getopt(argc, argv, contam_opt_str);
335337
while (opt != -1) {
336338
switch (opt) {
@@ -352,8 +354,8 @@ int main(int argc, char *argv[]) {
352354
opt = getopt(argc, argv, contam_opt_str);
353355
}
354356
if (!g_args.variants.empty() && !g_args.prefix.empty()) {
355-
std::vector<variant> variants = gmm_model(g_args.variants, g_args.prefix);
356-
cluster_consensus(variants, g_args.prefix, g_args.variants, g_args.min_threshold);
357+
std::vector<variant> variants = gmm_model(g_args.variants, g_args.prefix, g_args.min_depth, g_args.min_qual);
358+
cluster_consensus(variants, g_args.prefix, g_args.variants, g_args.min_threshold, g_args.min_depth, g_args.min_qual);
357359
}
358360
res = 0;
359361
g_args.prefix = get_filename_without_extension(g_args.prefix, ".bam");

src/primer_bed.cpp

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
#include "primer_bed.h"
22
#include "allele_functions.h"
33
#include "ref_seq.h"
4-
#include <chrono>
54
std::string primer::get_name() { return name; }
65

76
std::string primer::get_region() { return region; }
87

98
int primer::get_score() { return score; }
109

11-
std::vector<position> primer::get_positions() { return positions; }
12-
13-
void primer::set_positions(position pos) { positions.push_back(pos); }
14-
1510
uint32_t primer::get_start() const { return start; }
1611

1712
uint32_t primer::get_end() const { return end; }
@@ -75,22 +70,6 @@ float weighted_similarity(std::string string_1, std::string string_2){
7570
return similarity_score / total_weight;
7671
}
7772

78-
void primer::populate_positions(){
79-
for(uint32_t i=0; i < this->hardcoded_length; i++){
80-
position tmp;
81-
tmp.alleles = populate_basic_alleles();
82-
if(this->get_strand() == '+'){
83-
tmp.pos = i + this->get_start();
84-
} else {
85-
if(this->get_end() > i){
86-
break;
87-
}
88-
tmp.pos = this->get_end() - i;
89-
}
90-
positions.push_back(tmp);
91-
}
92-
}
93-
9473
std::vector<primer> populate_from_file(std::string path, int32_t offset = 0) {
9574
std::ifstream data(path.c_str());
9675
std::string line;

src/primer_bed.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,10 @@ class primer {
2020
int16_t pair_indice;
2121
int16_t indice;
2222
uint32_t read_count = 0;
23-
std::vector<position> positions; //allele counts for this primer
2423

2524
public:
26-
uint32_t hardcoded_length=300; //CHANGE
2725
std::string get_name();
2826
std::string get_region();
29-
std::vector<position> get_positions();
30-
void set_positions(position pos);
3127
int get_score();
3228
uint32_t get_start() const;
3329
uint32_t get_end() const;
@@ -36,7 +32,6 @@ class primer {
3632
int16_t get_pair_indice();
3733
int16_t get_indice() const;
3834
uint32_t get_read_count() const;
39-
void populate_positions();
4035
void set_start(uint32_t s);
4136
void set_end(uint32_t e);
4237
void set_strand(char s);
@@ -64,8 +59,4 @@ primer get_max_end(std::vector<primer> primers);
6459
std::vector<uint32_t> get_nlengths();
6560
std::vector<uint32_t> get_start_positions();
6661
std::vector<bool> get_direction();
67-
std::vector<std::vector<std::string>> get_qnames();
68-
void set_positions(position pos);
69-
std::vector<position> get_positions();
70-
void populate_positions();
7162
#endif

src/saga.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &amplico
191191

192192
amplicons.find_read_amplicon(start_forward, end_reverse, found_amplicon, bam_get_qname(read1), amp_start, amp_dist);
193193
if(!found_amplicon){
194+
if(start_forward < 23063 && end_reverse > 23063){
195+
std::cerr << start_forward << " " << end_reverse << " " << bam_get_qname(read1) << " " << bam_get_qname(read2) << std::endl;
196+
}
194197
amplicons.add_read_variants(final_positions, final_bases, final_qualities, min_qual);
195198
} else {
196199
amplicons.assign_read_amplicon(amp_start, final_positions, final_bases, final_qualities, min_qual);
@@ -402,6 +405,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, std:
402405
start_pos = aln->core.pos;
403406
}
404407
//TEST LINES
408+
if(start_pos < 22063 || start_pos > 24063) continue;
405409
//if(start_pos < 15176 || start_pos > 17176) continue;
406410
bam1_t *r = aln;
407411
//get the md tag
@@ -519,7 +523,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out, std:
519523
std::vector<uint32_t> flagged_positions;
520524
std::vector<float> std_deviations;
521525
std::vector<std::string> pos_nuc;
522-
uint32_t test_pos = 0;
526+
uint32_t test_pos = 23064;
523527
//detect fluctuating variants across amplicons
524528
for(uint32_t i=0; i < amplicons.max_pos; i++){
525529
amplicons.test_flux.clear();

0 commit comments

Comments
 (0)