Skip to content

Commit 58f6762

Browse files
committed
updating functions for assigning to amplicon and adding to amplicon
1 parent ff343db commit 58f6762

File tree

3 files changed

+90
-53
lines changed

3 files changed

+90
-53
lines changed

src/interval_tree.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,20 +67,34 @@ void IntervalTree::combine_haplotypes(ITNode *root){
6767
combine_haplotypes(root->right);
6868
}
6969

70-
void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found){
70+
void IntervalTree::assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities){
7171
if (root==NULL) return;
72-
//if ((uint32_t)root->data->low > upper) return;
73-
if(((uint32_t)root->data->low <= lower) && (upper <= (uint32_t)root->data->high)){
72+
if((uint32_t)root->data->low == amp_start){
7473
for(uint32_t i=0; i < positions.size(); i++){
7574
for(uint32_t j=0; j < root->amp_positions.size(); j++){
7675
if(positions[i] == root->amp_positions[j].pos){
7776
root->amp_positions[j].update_alleles(bases[i], 1, qualities[i]);
78-
found = true;
7977
}
8078
}
8179
}
8280
}
83-
find_read_amplicon(root->right, lower, upper, positions, bases, qualities, found);
81+
assign_read_amplicon(root->right, amp_start, positions, bases, qualities);
82+
}
83+
84+
void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist){
85+
//read_name is only passed through for testing; the search itself does not use it
86+
if (root==NULL) return;
87+
//if ((uint32_t)root->data->low > upper) return;
88+
if(((uint32_t)root->data->low <= lower) && (upper <= (uint32_t)root->data->high)){
89+
//dist is the combined gap between [lower, upper] and this amplicon's ends: (lower - low) + (high - upper); the smallest dist seen is kept
90+
uint32_t dist = (lower - root->data->low) + (root->data->high - upper);
91+
if(dist < amp_dist) {
92+
amp_dist = dist;
93+
amp_start = root->data->low;
94+
}
95+
found = true;
96+
}
97+
find_read_amplicon(root->right, lower, upper, found, read_name, amp_start, amp_dist);
8498
}
8599

86100
void IntervalTree::amplicon_position_pop(ITNode *root){

src/interval_tree.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ class IntervalTree {
5959
void detect_abberations(ITNode *root, uint32_t pos);
6060
void detect_amplicon_overlaps(ITNode *root, uint32_t pos);
6161
void detect_primer_issues(ITNode *root, uint32_t pos);
62-
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found);
62+
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist);
63+
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
64+
6365
public:
6466
uint32_t max_pos=0;
6567
std::vector<std::vector<uint32_t>> overlaps;
@@ -85,7 +87,8 @@ class IntervalTree {
8587
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
8688
void populate_variants(uint32_t last_position);
8789
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
88-
void find_read_amplicon(uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found) {find_read_amplicon(_root, lower, upper, positions, bases, qualities, found);}
90+
void find_read_amplicon(uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist) {find_read_amplicon(_root, lower, upper, found, read_name, amp_start, amp_dist);}
91+
void assign_read_amplicon(uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities) {assign_read_amplicon(_root, amp_start, positions, bases, qualities);}
8992
void amplicon_position_pop() {amplicon_position_pop(_root);}
9093
};
9194

@@ -99,8 +102,9 @@ void populate_variants(uint32_t last_position);
99102
int unpaired_primers(ITNode *root, primer prim);
100103
void detect_primer_issues(ITNode *root, uint32_t find_position);
101104
void detect_amplicon_overlaps(ITNode *root, uint32_t find_position);
102-
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found);
105+
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, bool &found, std::string read_name, uint32_t &amp_start, uint32_t &amp_dist);
103106
IntervalTree populate_amplicons(std::string pair_info_file, std::vector<primer> &primers);
104107
IntervalTree amplicon_position_pop();
108+
void assign_read_amplicon(ITNode *root, uint32_t amp_start, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
105109
void write_out_frequencies(ITNode *root, std::string filename);
106110
#endif

src/saga.cpp

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
#include <tuple>
1010
using namespace std::chrono;
1111

12-
void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vector<std::string> &bases, std::vector<uint32_t> &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos){
13-
//ref_start_pos describe the point after which we start recording bases
12+
void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vector<std::string> &bases, std::vector<uint32_t> &qualities, uint32_t total_ref_pos){
13+
uint32_t total_query_pos=0;
1414
const uint8_t* seq_field1 = bam_get_seq(read1);
1515
uint32_t *cigar1 = bam_get_cigar(read1);
1616
uint8_t* qual = bam_get_qual(read1);
17-
17+
total_ref_pos += 1;
1818
for (uint32_t i = 0; i < read1->core.n_cigar; i++){
1919
uint32_t op = bam_cigar_op(cigar1[i]);
2020
uint32_t len = bam_cigar_oplen(cigar1[i]);
@@ -88,18 +88,14 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &amplico
8888
//pass the forward first then reverse
8989
//underlying assumption here is that the overlap region is identical
9090
//also assumes that the forward read starts more "left" than the reverse
91-
91+
//std::cerr << "merging reads"<< std::endl;
9292
//get coordinates for potential overlap area
93-
uint32_t end_forward = find_sequence_end(read1);
93+
//uint32_t end_forward = find_sequence_end(read1);
9494
uint32_t start_reverse = read2->core.pos;
9595

9696
uint32_t start_forward = read1->core.pos;
9797
uint32_t end_reverse = find_sequence_end(read2);
98-
99-
//iterate the first cigar string
100-
uint32_t total_ref_pos = start_forward;
101-
uint32_t total_query_pos = 0;
102-
98+
10399
//record the positions and their bases
104100
std::vector<uint32_t> positions1;
105101
std::vector<std::string> bases1;
@@ -108,18 +104,10 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &amplico
108104
std::vector<uint32_t> positions2;
109105
std::vector<std::string> bases2;
110106
std::vector<uint32_t> qualities2;
111-
112-
//we use all of the first read
113-
uint32_t end_overlap, begin_overlap;
114-
if(end_reverse <= end_forward){
115-
end_overlap = end_reverse-1;
116-
} else {
117-
end_overlap = end_forward-1;
118-
}
119-
begin_overlap = start_reverse;
120-
121-
parse_cigar(read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward);
122-
parse_cigar(read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward);
107+
108+
//parse each read's cigar, starting from that read's core.pos
109+
parse_cigar(read1, positions1, bases1, qualities1, start_forward);
110+
parse_cigar(read2, positions2, bases2, qualities2, start_reverse);
123111

124112
//find all unique positions we need to cover
125113
std::unordered_set<uint32_t> unique_elements(positions1.begin(), positions1.end());
@@ -205,19 +193,27 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &amplico
205193
}
206194

207195
//TESTLINES
208-
/*
196+
uint32_t test_counter = 0;
209197
for(uint32_t i=0; i < final_positions.size(); i++){
210-
std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
211-
}
212-
exit(0);*/
213-
198+
if(final_positions[i] == 16176){
199+
test_counter ++;
200+
//std::cerr << bam_get_qname(read1) << std::endl;
201+
//std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
202+
}
203+
}
204+
//exit(0);
214205

215206
//find assigned amplicon and populate position vector
216207
bool found_amplicon = false;
217-
amplicons.find_read_amplicon(start_forward, end_reverse, final_positions, final_bases, final_qualities, found_amplicon);
208+
//std::cerr << bam_get_qname(read1) << std::endl;
209+
uint32_t amp_dist = 429496729;
210+
uint32_t amp_start = 0;
211+
amplicons.find_read_amplicon(start_forward, end_reverse, found_amplicon, bam_get_qname(read1), amp_start, amp_dist);
212+
//std::cerr << found_amplicon << std::endl;
218213
if(!found_amplicon){
219214
amplicons.add_read_variants(final_positions, final_bases, final_qualities);
220-
std::cerr << "back " << found_amplicon << std::endl;
215+
} else {
216+
amplicons.assign_read_amplicon(amp_start, final_positions, final_bases, final_qualities);
221217
}
222218
}
223219

@@ -383,8 +379,10 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
383379
while (sam_read1(in, header, aln) >= 0) {
384380
//get the name of the read
385381
std::string read_name = bam_get_qname(aln);
386-
//if(read_name != "A01535:8:HJ3YYDSX2:4:1103:6840:26052") continue;
387-
std::cerr << read_name << std::endl;
382+
//TESTLINES
383+
//if(read_name != "A01535:8:HJ3YYDSX2:4:1147:23972:15421") continue;
384+
//if(read_name != "A01535:8:HJ3YYDSX2:4:1108:10212:30326") continue;
385+
388386
//std::cerr << read_name << std::endl;
389387
strand = '+';
390388
if (bam_is_rev(aln)) {
@@ -394,7 +392,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
394392
start_pos = aln->core.pos;
395393
}
396394
//TEST LINES
397-
//if(start_pos < 13000 || start_pos > 15000) continue;
395+
//if(start_pos < 15176 || start_pos > 17176) continue;
398396
bam1_t *r = aln;
399397
//get the md tag
400398
uint8_t *aux = bam_aux_get(aln, "MD");
@@ -415,17 +413,27 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
415413
amplicons.add_read_variants(cigar, aln->core.pos, nlength, seq, aux, qualities, bam_get_qname(aln));
416414
continue;
417415
}
418-
//TESTLINES
419-
if (aln->core.flag & BAM_FPAIRED){
420-
//std::cerr << bam_get_qname(aln) << std::endl;
421-
} else{
422-
//TODO HANDLE THIS CASE
423-
std::cerr << "read is unpaired" << std::endl;
424-
std::cerr << bam_get_qname(aln) << std::endl;
425-
exit(0);
416+
if (!aln->core.flag & BAM_FPAIRED){
417+
std::cerr << "not paired" << std::endl;
418+
//if the read is unpaired, try to assign it to an amplicon anyway
419+
std::vector<uint32_t> positions;
420+
std::vector<std::string> bases;
421+
std::vector<uint32_t> qualities;
422+
uint32_t start_read = aln->core.pos;
423+
uint32_t end_read = find_sequence_end(aln);
424+
parse_cigar(aln, positions, bases, qualities, start_read);
425+
bool found_amplicon = false;
426+
uint32_t amp_dist = 429496729;
427+
uint32_t amp_start = 0;
428+
amplicons.find_read_amplicon(start_read, end_read, found_amplicon, read_name, amp_start, amp_dist);
429+
if(!found_amplicon){
430+
amplicons.add_read_variants(positions, bases, qualities);
431+
} else{
432+
amplicons.assign_read_amplicon(amp_start, positions, bases, qualities);
433+
}
434+
std::cerr << read_name << std::endl;
435+
continue;
426436
}
427-
428-
429437
auto it = read_map.find(read_name);
430438
//assumption is that read pairs share a name
431439
//execute if we've already seen the mate
@@ -444,7 +452,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
444452
// Store the current read in the map
445453
read_map[read_name] = bam_dup1(aln); // Duplicate the read to avoid overwriting
446454
}
447-
std::cerr << "end this" << std::endl;
448455
continue;
449456

450457
//TEST LINES
@@ -515,7 +522,6 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
515522
}
516523

517524
}
518-
std::cerr << "this" << std::endl;
519525
/*
520526
std::cerr << "transforming primers" << std::endl;
521527
//this is super time costly
@@ -537,10 +543,23 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
537543
amplicons.write_out_frequencies(amp_file);
538544
}
539545

546+
std::cerr << "test here " << amplicons.variants[16176].depth << std::endl;
547+
std::cerr << "read map remaining " << read_map.size() << std::endl;
548+
for(auto allele : amplicons.variants[16176].alleles){
549+
std::cerr << allele.nuc << " " << allele.depth << std::endl;
550+
}
551+
540552
//combine amplicon counts to get total variants
541553
amplicons.combine_haplotypes();
542554
//detect primer binding issues
543555
std::vector<position> variants = amplicons.variants;
556+
557+
std::cerr << "test here " << amplicons.variants[16176].depth << std::endl;
558+
for(auto allele : amplicons.variants[16176].alleles){
559+
std::cerr << allele.nuc << " " << allele.depth << std::endl;
560+
}
561+
//exit(0);
562+
544563

545564
//add in primer info
546565
for(uint32_t i=0; i < variants.size(); i++){
@@ -567,7 +586,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
567586
std::vector<uint32_t> flagged_positions;
568587
std::vector<float> std_deviations;
569588
std::vector<std::string> pos_nuc;
570-
uint32_t test_pos = 13572;
589+
uint32_t test_pos = 0;
571590
//detect fluctuating variants across amplicons
572591
for(uint32_t i=0; i < amplicons.max_pos; i++){
573592
amplicons.test_flux.clear();

0 commit comments

Comments
 (0)