Skip to content

Commit 0abe1e7

Browse files
committed
cleaning up code changing way variants is calculated
1 parent 25bb58a commit 0abe1e7

File tree

7 files changed

+106
-818
lines changed

7 files changed

+106
-818
lines changed

src/interval_tree.cpp

Lines changed: 9 additions & 246 deletions
Original file line numberDiff line numberDiff line change
@@ -67,18 +67,20 @@ void IntervalTree::combine_haplotypes(ITNode *root){
6767
combine_haplotypes(root->right);
6868
}
6969

70-
void IntervalTree::assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities){
70+
void IntervalTree::assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual){
7171
if (root==NULL) return;
7272
if((uint32_t)root->data->low == amp_start){
7373
for(uint32_t i=0; i < positions.size(); i++){
7474
for(uint32_t j=0; j < root->amp_positions.size(); j++){
7575
if(positions[i] == root->amp_positions[j].pos){
76-
root->amp_positions[j].update_alleles(bases[i], 1, qualities[i]);
76+
if(qualities[i] >= (uint32_t) min_qual){
77+
root->amp_positions[j].update_alleles(bases[i], 1, qualities[i]);
78+
}
7779
}
7880
}
7981
}
8082
}
81-
assign_read_amplicon(root->right, amp_start, positions, bases, qualities);
83+
assign_read_amplicon(root->right, amp_start, positions, bases, qualities, min_qual);
8284
}
8385

8486
void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist){
@@ -99,7 +101,7 @@ void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upp
99101

100102
void IntervalTree::amplicon_position_pop(ITNode *root){
101103
if (root==NULL) return;
102-
for(uint32_t i=root->data->low; i < root->data->high; i++){
104+
for(uint32_t i=root->data->low; i < (uint32_t)root->data->high; i++){
103105
position add_pos;
104106
add_pos.pos = i;
105107
add_pos.alleles = populate_basic_alleles();
@@ -149,212 +151,11 @@ void IntervalTree::detect_abberations(ITNode *root, uint32_t find_position){
149151

150152
}
151153

152-
void IntervalTree::add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities){
154+
void IntervalTree::add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual){
153155
for(uint32_t i=0; i < positions.size(); i++){
154-
variants[positions[i]].update_alleles(bases[i], 1, qualities[i]);
155-
}
156-
}
157-
158-
void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, uint8_t* qual, std::string qname) {
159-
uint32_t consumed_query = 0;
160-
uint32_t consumed_ref = 0;
161-
std::vector<uint32_t> sc_positions; //sc positions
162-
std::vector<uint32_t> ignore_sequence; //positions in query that are insertions
163-
//first we handle insertion from the cigar
164-
std::vector<uint32_t> useful;
165-
std::vector<uint8_t> sequence;
166-
std::vector<uint8_t> quality;
167-
168-
uint32_t quality_threshold=20;
169-
uint8_t nt = 0;
170-
uint32_t length = 0, ll=0, op=0;
171-
//auto start = high_resolution_clock::now();
172-
for(uint32_t i=0; i < nlength; i++){
173-
op = bam_cigar_op(cigar[i]);
174-
if (bam_cigar_type(op) & 1){
175-
length = bam_cigar_oplen(cigar[i]);
176-
for(uint32_t k=0; k < length; k++){
177-
useful.push_back(ll+k);
178-
}
179-
ll += length;
180-
}
181-
}
182-
//TESTLINES
183-
uint32_t qual_threshold = 20;
184-
quality.reserve(useful.size());
185-
sequence.reserve(useful.size());
186-
for(uint32_t k=0; k < useful.size(); k++){
187-
if(qual[useful[k]] < qual_threshold){
188-
sequence.push_back('L');
189-
total_qual += qual[useful[k]];
190-
total_bases++;
191-
} else {
192-
nt = seq_nt16_str[bam_seqi(seq, useful[k])];
193-
sequence.push_back(nt);
194-
}
195-
quality.push_back(qual[useful[k]]);;
196-
}
197-
useful.clear();
198-
std::string nuc;
199-
for(uint32_t j=0; j < nlength; j++){
200-
op = bam_cigar_op(cigar[j]);
201-
uint32_t oplen = bam_cigar_oplen(cigar[j]);
202-
if(op == 1){
203-
std::ostringstream convert;
204-
uint32_t q = 0;
205-
for(uint32_t k=0; k < oplen; k++) {
206-
//convert data type to get the characters
207-
ignore_sequence.push_back(k+consumed_ref+start_pos);
208-
convert << sequence[k+consumed_query];
209-
q += quality[k+consumed_query];
210-
}
211-
nuc.clear();
212-
nuc = convert.str();
213-
214-
int avg_q = (int)q/nuc.size();
215-
char ch = 'L';
216-
std::string nuc = "+" + convert.str();
217-
//check if this position exists
218-
int exists = check_position_exists(start_pos+consumed_ref, variants);
219-
if (exists != -1 && nuc.find(ch) == std::string::npos) {
220-
variants[exists].update_alleles(nuc, 1, avg_q);
221-
} else {
222-
//add position to vector
223-
position add_pos;
224-
add_pos.pos = start_pos+consumed_ref; //don't add a position
225-
if (nuc.find(ch) == std::string::npos) {
226-
add_pos.update_alleles(nuc, 1, avg_q);
227-
variants.push_back(add_pos);
228-
}
229-
}
230-
consumed_query += oplen;
231-
continue;
232-
}
233-
if (!(bam_cigar_type(op) & 1) && !(bam_cigar_type(op) & 2)){
234-
continue;
235-
}
236-
//if we don't consume both query and reference
237-
if (!(bam_cigar_type(op) & 1) || !(bam_cigar_type(op) & 2)){
238-
if (op != 2){
239-
for(uint32_t k=0; k < oplen; k++) {
240-
//convert data type to get the characters
241-
sc_positions.push_back(k+consumed_query);
242-
}
243-
}
244-
}
245-
//consumes query
246-
if (bam_cigar_type(op) & 1){
247-
consumed_query += oplen;
248-
}
249-
//consumes ref
250-
if (bam_cigar_type(op) & 2){
251-
consumed_ref += oplen;
252-
}
253-
}
254-
//we will use the aux tag to handle deletions
255-
bool deletion = false;
256-
bool substitution = false;
257-
uint32_t current_pos = start_pos;
258-
std::vector<uint32_t> deletion_positions; //the aux tag does NOT recognize insertions
259-
std::vector<uint32_t> substitutions; //handle substitutions
260-
std::string gather_digits = "";
261-
std::string deleted_char;
262-
std::string sub_char;
263-
uint32_t last_char = 0;
264-
uint32_t j = 0;
265-
do {
266-
char character = (char)aux[j];
267-
if (character == '^'){
268-
current_pos += std::stoi(gather_digits);
269-
gather_digits = "";
270-
deletion = true;
271-
} else if (isdigit(character) && deletion) {
272-
deleted_char = "";
273-
deletion = false;
274-
gather_digits += character;
275-
} else if (isalpha(character) && deletion) {
276-
variants[current_pos].update_alleles("-", 1, 0);
277-
deletion_positions.push_back(current_pos);
278-
current_pos += 1;
279-
deleted_char += character;
280-
deletion = true;
281-
} else if (isdigit(character) && !deletion) {
282-
if(substitution){
283-
for(uint32_t z=1; z < sub_char.size(); z++){
284-
substitutions.push_back(current_pos + z);
285-
}
286-
substitution = false;
287-
sub_char.clear();
288-
}
289-
if(last_char > 0){
290-
current_pos += last_char;
291-
last_char = 0;
292-
}
293-
gather_digits += character;
294-
} else if (isalpha(character) && !deletion) {
295-
last_char += 1;
296-
substitution = true;
297-
sub_char += character;
298-
if(gather_digits.size() > 0){
299-
current_pos += std::stoi(gather_digits);
300-
}
301-
gather_digits = "";
302-
}
303-
j++;
304-
} while(aux[j] != '\0');
305-
//now that we know where the insertions and deletions are, let's just iterate the query sequence and add it in, skipping problem positions
306-
current_pos = start_pos;
307-
bool prev_insertion = false;
308-
std::ostringstream convert;
309-
nuc.clear();
310-
std::vector<uint32_t> seen_insertions;
311-
std::vector<uint32_t>::iterator i_it;
312-
std::string test = "";
313-
//j is relative to the sequence and current pos to the reference
314-
for(uint32_t j=0; j < sequence.size(); j++){
315-
if(qname == test){
316-
std::cerr << "j " << j << " cur pos " << current_pos << " " << sequence[j] << std::endl;
317-
}
318-
std::vector<uint32_t>::iterator it = find(deletion_positions.begin(), deletion_positions.end(), current_pos);
319-
if (it != deletion_positions.end()) {
320-
current_pos += 1;
321-
j -= 1;
322-
continue;
323-
}
324-
it = find(ignore_sequence.begin(), ignore_sequence.end(), current_pos);
325-
i_it = find(seen_insertions.begin(), seen_insertions.end(), current_pos);
326-
//handle insertions
327-
if (it != ignore_sequence.end() && i_it == seen_insertions.end()) {
328-
std::ostringstream convert;
329-
convert << sequence[j];
330-
nuc += convert.str();
331-
seen_insertions.push_back(current_pos);
332-
current_pos += 1;
333-
prev_insertion = true;
334-
continue;
335-
} else if (it == ignore_sequence.end()){
336-
//insertion is over
337-
if(prev_insertion) {
338-
current_pos -= nuc.size();
339-
}
340-
prev_insertion = false;
341-
nuc = "";
342-
}
343-
it = find(sc_positions.begin(), sc_positions.end(), j);
344-
if (it != sc_positions.end()){
345-
continue;
346-
}
347-
current_pos += 1;
348-
std::ostringstream convert;
349-
convert << sequence[j];
350-
std::string nuc = convert.str();
351-
if((uint32_t)quality[j] < quality_threshold){
352-
continue;
156+
if(qualities[i] >= (uint32_t)min_qual){
157+
variants[positions[i]].update_alleles(bases[i], 1, qualities[i]);
353158
}
354-
char ch = 'L';
355-
if (nuc.find(ch) == std::string::npos) {
356-
variants[current_pos].update_alleles(nuc, 1, quality[j]);
357-
}
358159
}
359160
}
360161

@@ -379,44 +180,6 @@ int IntervalTree::unpaired_primers(ITNode *root, primer prim){
379180
return ret;
380181
}
381182

382-
void IntervalTree::set_haplotypes(ITNode *root, primer prim){
383-
if (root==NULL) return;
384-
char strand = prim.get_strand();
385-
//these are the bounds on the amplicon
386-
if(strand == '+' && ((int)prim.get_start() != root->data->low)){
387-
set_haplotypes(root->right, prim);
388-
} else if (strand == '-' && ((int)prim.get_end()+1 != root->data->high)){
389-
set_haplotypes(root->right, prim);
390-
} else {
391-
// we found the matching amplion, now we add this cigarotype to the amplicon
392-
std::vector<position> tmp_pos = prim.get_positions();
393-
for (uint32_t i=0; i < tmp_pos.size(); i++){
394-
position add_pos = tmp_pos[i];
395-
bool found = false;
396-
// check if we already have a pos for this pos
397-
for(uint32_t j=0; j < root->amp_positions.size(); j++){
398-
position here_pos = root->amp_positions[j];
399-
if(here_pos.pos == tmp_pos[i].pos){
400-
found = true;
401-
root->amp_positions[j].depth += tmp_pos[i].depth;
402-
std::vector<allele> new_alleles = add_allele_vectors(tmp_pos[i].alleles, root->amp_positions[j].alleles);
403-
root->amp_positions[j].alleles = new_alleles;
404-
break;
405-
}
406-
}
407-
//if we've never seen this pos for this haplotype, push a new one
408-
if (!found){
409-
position tmp;
410-
tmp.depth = add_pos.depth;
411-
tmp.alleles = add_pos.alleles;
412-
tmp.pos = add_pos.pos;
413-
root->amp_positions.push_back(tmp);
414-
}
415-
}
416-
return;
417-
}
418-
}
419-
420183
// A utility function to insert a new Interval Search Tree Node
421184
// This is similar to BST Insert. Here the low value of interval
422185
// is used tomaintain BST property

src/interval_tree.h

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,14 @@ class IntervalTree {
5252
void amplicon_position_pop(ITNode *root);
5353
void print_amplicons(ITNode *root);
5454
void get_max_pos(ITNode *root);
55-
void set_haplotypes(ITNode *root, primer prim);
5655
int unpaired_primers(ITNode *root, primer prim);
5756
void combine_haplotypes(ITNode *root);
5857
void write_out_frequencies(ITNode *root, std::string filename);
5958
void detect_abberations(ITNode *root, uint32_t pos);
6059
void detect_amplicon_overlaps(ITNode *root, uint32_t pos);
6160
void detect_primer_issues(ITNode *root, uint32_t pos);
6261
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist);
63-
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
62+
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual);
6463

6564
public:
6665
uint32_t max_pos=0;
@@ -77,34 +76,30 @@ class IntervalTree {
7776
void inOrder() { inOrder(_root); }
7877
void print_amplicons() {print_amplicons(_root);}
7978
void get_max_pos() {get_max_pos(_root);}
80-
void set_haplotypes(primer prim) {set_haplotypes(_root, prim);}
8179
int unpaired_primers(primer prim) { return unpaired_primers(_root, prim);}
8280
void detect_abberations(uint32_t pos) {detect_abberations(_root, pos);}
8381
void detect_amplicon_overlaps(uint32_t pos) {detect_amplicon_overlaps(_root, pos);}
8482
void detect_primer_issues(uint32_t pos) {detect_primer_issues(_root, pos);}
8583
void combine_haplotypes() {combine_haplotypes(_root);}
8684
void write_out_frequencies(std::string filename){write_out_frequencies(_root, filename);}
87-
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
8885
void populate_variants(uint32_t last_position);
89-
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
86+
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual);
9087
void find_read_amplicon(uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist) {find_read_amplicon(_root, lower, upper, found, read_name, amp_start, amp_dist);}
91-
void assign_read_amplicon(uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities) {assign_read_amplicon(_root, amp_start, positions, bases, qualities);}
88+
void assign_read_amplicon(uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual) {assign_read_amplicon(_root, amp_start, positions, bases, qualities, min_qual);}
9289
void amplicon_position_pop() {amplicon_position_pop(_root);}
9390
};
9491

9592
void combine_haplotypes();
9693
void detect_abberations(ITNode *root, uint32_t find_position);
9794
void get_max_pos();
98-
void set_haplotypes(ITNode *root, primer prim);
99-
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
100-
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
95+
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual);
10196
void populate_variants(uint32_t last_position);
10297
int unpaired_primers(ITNode *root, primer prim);
10398
void detect_primer_issues(ITNode *root, uint32_t find_position);
10499
void detect_amplicon_overlaps(ITNode *root, uint32_t find_position);
105100
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist);
106101
IntervalTree populate_amplicons(std::string pair_info_file, std::vector<primer> &primers);
107102
IntervalTree amplicon_position_pop();
108-
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
103+
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, uint8_t min_qual);
109104
void write_out_frequencies(ITNode *root, std::string filename);
110105
#endif

src/ivar.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,8 @@ int main(int argc, char *argv[]) {
364364
g_args.bed = "";
365365
g_args.primer_pair_file = "";
366366
g_args.primer_offset = 0;
367+
g_args.min_depth = 10;
368+
g_args.min_qual = 20;
367369
opt = getopt(argc, argv, trim_opt_str);
368370
while (opt != -1) {
369371
switch (opt) {
@@ -404,7 +406,7 @@ int main(int argc, char *argv[]) {
404406
g_args.prefix = get_filename_without_extension(g_args.prefix, ".bam");
405407
res = preprocess_reads(g_args.bam, g_args.bed, g_args.prefix,
406408
cl_cmd.str(),
407-
g_args.primer_pair_file, g_args.primer_offset);
409+
g_args.primer_pair_file, g_args.primer_offset, g_args.min_depth, g_args.min_qual);
408410
//std::vector<uint32_t> populations_iterate;
409411
//for(uint32_t i= 2; i < 6; i++){
410412
// populations_iterate.push_back(i);

0 commit comments

Comments
 (0)