Skip to content

Commit ff343db

Browse files
committed
adding insertion overlap read merge code
1 parent b27ea58 commit ff343db

File tree

3 files changed

+117
-81
lines changed

3 files changed

+117
-81
lines changed

src/interval_tree.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,20 @@ void IntervalTree::combine_haplotypes(ITNode *root){
6767
combine_haplotypes(root->right);
6868
}
6969

70-
void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities){
70+
void IntervalTree::find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found){
7171
if (root==NULL) return;
7272
//if ((uint32_t)root->data->low > upper) return;
7373
if(((uint32_t)root->data->low <= lower) && (upper <= (uint32_t)root->data->high)){
7474
for(uint32_t i=0; i < positions.size(); i++){
7575
for(uint32_t j=0; j < root->amp_positions.size(); j++){
7676
if(positions[i] == root->amp_positions[j].pos){
7777
root->amp_positions[j].update_alleles(bases[i], 1, qualities[i]);
78+
found = true;
7879
}
7980
}
8081
}
8182
}
82-
find_read_amplicon(root->right, lower, upper, positions, bases, qualities);
83+
find_read_amplicon(root->right, lower, upper, positions, bases, qualities, found);
8384
}
8485

8586
void IntervalTree::amplicon_position_pop(ITNode *root){
@@ -134,6 +135,12 @@ void IntervalTree::detect_abberations(ITNode *root, uint32_t find_position){
134135

135136
}
136137

138+
void IntervalTree::add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities){
139+
for(uint32_t i=0; i < positions.size(); i++){
140+
variants[positions[i]].update_alleles(bases[i], 1, qualities[i]);
141+
}
142+
}
143+
137144
void IntervalTree::add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *seq, uint8_t *aux, uint8_t* qual, std::string qname) {
138145
uint32_t consumed_query = 0;
139146
uint32_t consumed_ref = 0;

src/interval_tree.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class IntervalTree {
5959
void detect_abberations(ITNode *root, uint32_t pos);
6060
void detect_amplicon_overlaps(ITNode *root, uint32_t pos);
6161
void detect_primer_issues(ITNode *root, uint32_t pos);
62-
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
62+
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found);
6363
public:
6464
uint32_t max_pos=0;
6565
std::vector<std::vector<uint32_t>> overlaps;
@@ -84,7 +84,8 @@ class IntervalTree {
8484
void write_out_frequencies(std::string filename){write_out_frequencies(_root, filename);}
8585
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
8686
void populate_variants(uint32_t last_position);
87-
void find_read_amplicon(uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities) {find_read_amplicon(_root, lower, upper, positions, bases, qualities);}
87+
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
88+
void find_read_amplicon(uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found) {find_read_amplicon(_root, lower, upper, positions, bases, qualities, found);}
8889
void amplicon_position_pop() {amplicon_position_pop(_root);}
8990
};
9091

@@ -93,11 +94,12 @@ void detect_abberations(ITNode *root, uint32_t find_position);
9394
void get_max_pos();
9495
void set_haplotypes(ITNode *root, primer prim);
9596
void add_read_variants(uint32_t *cigar, uint32_t start_pos, uint32_t nlength, uint8_t *sequence, uint8_t *aux, uint8_t *quality, std::string qname);
97+
void add_read_variants(std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
9698
void populate_variants(uint32_t last_position);
9799
int unpaired_primers(ITNode *root, primer prim);
98100
void detect_primer_issues(ITNode *root, uint32_t find_position);
99101
void detect_amplicon_overlaps(ITNode *root, uint32_t find_position);
100-
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities);
102+
void find_read_amplicon(ITNode *root, uint32_t lower, uint32_t upper, std::vector<uint32_t> positions, std::vector<std::string> bases, std::vector<uint32_t> qualities, bool &found);
101103
IntervalTree populate_amplicons(std::string pair_info_file, std::vector<primer> &primers);
102104
IntervalTree amplicon_position_pop();
103105
void write_out_frequencies(ITNode *root, std::string filename);

src/saga.cpp

Lines changed: 103 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include <tuple>
1010
using namespace std::chrono;
1111

12-
void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vector<std::string> &bases, std::vector<uint32_t> &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos, std::vector<std::string> &overlap_seq, uint32_t o_lower, uint32_t o_upper, std::vector<uint32_t> &insertion_positions){
12+
void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vector<std::string> &bases, std::vector<uint32_t> &qualities, uint32_t total_ref_pos, uint32_t total_query_pos, uint32_t ref_start_pos){
1313
//ref_start_pos describe the point after which we start recording bases
1414
const uint8_t* seq_field1 = bam_get_seq(read1);
1515
uint32_t *cigar1 = bam_get_cigar(read1);
@@ -23,46 +23,27 @@ void parse_cigar(const bam1_t* read1, std::vector<uint32_t> &positions, std::vec
2323
for(uint32_t j=total_query_pos; j < total_query_pos+len; j++){
2424
char nuc = seq_nt16_str[bam_seqi(seq_field1, j)];
2525
std::string tmp = std::string(1, nuc);
26-
if(total_ref_pos+counter >= ref_start_pos){
27-
positions.push_back(total_ref_pos+counter);
28-
bases.push_back(tmp);
29-
qualities.push_back((uint32_t)qual[j]);
30-
}
31-
if(total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
32-
//std::cerr << total_ref_pos+counter << " this " << std::endl;
33-
overlap_seq.push_back(tmp);
34-
}
26+
positions.push_back(total_ref_pos+counter);
27+
bases.push_back(tmp);
28+
qualities.push_back((uint32_t)qual[j]);
3529
counter++;
3630
}
3731
}
3832
if(op == 2){
39-
for(uint32_t j=0; j < len; j++){
40-
if(total_ref_pos+counter >= ref_start_pos){
41-
positions.push_back(total_ref_pos+counter);
42-
bases.push_back("-");
43-
qualities.push_back((uint32_t)20);
44-
}
45-
if(total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
46-
overlap_seq.push_back("-");
47-
//std::cerr << total_ref_pos+counter << " - " << std::endl;
48-
}
33+
for(uint32_t j=0; j < len; j++){
34+
positions.push_back(total_ref_pos+counter);
35+
bases.push_back("-");
36+
qualities.push_back((uint32_t)20);
4937
counter++;
5038
}
5139
}
5240
if(op == 1){
5341
for(uint32_t j=total_query_pos; j < total_query_pos+len; j++){
5442
char nuc = seq_nt16_str[bam_seqi(seq_field1, j)];
5543
std::string tmp = "+" + std::string(1, nuc);
56-
if(total_ref_pos+counter >= ref_start_pos){
57-
positions.push_back(total_ref_pos+counter);
58-
bases.push_back(tmp);
59-
qualities.push_back((uint32_t)qual[j]);
60-
}
61-
if(total_ref_pos+counter >= o_lower && total_ref_pos+counter <= o_upper){
62-
overlap_seq.push_back(tmp);
63-
insertion_positions.push_back(total_query_pos+counter);
64-
//std::cerr << total_ref_pos << " " << nuc << std::endl;
65-
}
44+
positions.push_back(total_ref_pos+counter);
45+
bases.push_back(tmp);
46+
qualities.push_back((uint32_t)qual[j]);
6647
counter++;
6748
}
6849
}
@@ -103,7 +84,7 @@ uint32_t find_sequence_end(const bam1_t* read){
10384
return start;
10485
}
10586

106-
void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree amplicons){
87+
void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree &amplicons){
10788
//pass the forward first then reverse
10889
//underlying assumption here is that the overlap region is identical
10990
//also assumes that the forward read starts more "left" than the reverse
@@ -128,10 +109,6 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree amplicon
128109
std::vector<std::string> bases2;
129110
std::vector<uint32_t> qualities2;
130111

131-
//CLEANUP can probably get rid of this
132-
std::vector<std::string> overlap_seq1;
133-
std::vector<std::string> overlap_seq2;
134-
135112
//we use all of the first read
136113
uint32_t end_overlap, begin_overlap;
137114
if(end_reverse <= end_forward){
@@ -141,57 +118,107 @@ void merge_reads(const bam1_t* read1, const bam1_t* read2, IntervalTree amplicon
141118
}
142119
begin_overlap = start_reverse;
143120

144-
//std::cerr << start_forward << " " << end_forward << std::endl;
145-
//std::cerr << start_reverse << " " << end_reverse << std::endl;
146-
std::vector<uint32_t> insertion_positions1;
147-
std::vector<uint32_t> insertion_positions2;
148-
149-
parse_cigar(read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward, overlap_seq1, begin_overlap, end_overlap, insertion_positions1);
150-
parse_cigar(read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward, overlap_seq2, begin_overlap, end_overlap, insertion_positions2);
151-
152-
std::vector<uint32_t> removal;
121+
parse_cigar(read1, positions1, bases1, qualities1, total_ref_pos, total_query_pos, start_forward);
122+
parse_cigar(read2, positions2, bases2, qualities2, start_reverse, total_query_pos, end_forward);
153123

154124
//find all unique positions we need to cover
155-
std::unordered_set<int> unique_elements(positions1.begin(), positions1.end());
125+
std::unordered_set<uint32_t> unique_elements(positions1.begin(), positions1.end());
156126
unique_elements.insert(positions2.begin(), positions2.end());
157127

128+
std::vector<uint32_t> final_positions;
129+
std::vector<uint32_t> final_qualities;
130+
std::vector<std::string> final_bases;
131+
158132
//for every position make sure the bases and qualities match
159133
//make sure insertions match
160134
for(auto pos : unique_elements){
161-
std::cerr << p << std::endl;
162-
}
163-
exit(0);
164-
/*if(overlap_seq1.size() == overlap_seq2.size(){
165-
for(uint32_t j=0; j < overlap_seq1.size(); j++){
166-
if(overlap_seq1[j] != overlap_seq2[j]){
167-
removal.push_back(j);
135+
auto first_it = std::find(positions1.begin(), positions1.end(), pos);
136+
auto second_it = std::find(positions2.begin(), positions2.end(), pos);
137+
138+
//if its in the first read but not the second
139+
if(first_it != positions1.end() && second_it == positions2.end()){
140+
auto it = positions1.begin();
141+
while ((it = std::find(it, positions1.end(), pos)) != positions1.end()) {
142+
uint32_t index1 = std::distance(positions1.begin(), it);
143+
final_positions.push_back(positions1[index1]);
144+
final_qualities.push_back(qualities1[index1]);
145+
final_bases.push_back(bases1[index1]);
146+
++it;
168147
}
169-
}
170-
} else {
171-
uint32_t max_range = std::max(overlap_seq1.size(), overlap_seq2.size());
172-
uint32_t counter1=0;
173-
uint32_t counter2=0;
174-
for(uint32_t j=0; j < max_range; j++){
175-
//check if this position is marked as an insertion in either sequence
176-
bool exists1 = std::count(insertion_positions1.begin(), insertion_positions1.end(), j) > 0;
177-
bool exists2 = std::count(insertion_positions2.begin(), insertion_positions2.end(), j) > 0;
178-
if(exists1 == exists2){
179-
if(overlap_seq1[counter1] != overlap_seq2[counter2]){
180-
removal.push_back();
148+
} else if(first_it == positions1.end() && second_it != positions2.end()){
149+
//if its in the second read but not the first
150+
auto it = positions2.begin();
151+
while ((it = std::find(it, positions2.end(), pos)) != positions2.end()) {
152+
uint32_t index2 = std::distance(positions2.begin(), it);
153+
final_positions.push_back(positions2[index2]);
154+
final_qualities.push_back(qualities2[index2]);
155+
final_bases.push_back(bases2[index2]);
156+
++it;
157+
}
158+
} else if(first_it != positions1.end() && second_it != positions2.end()){
159+
//if its in both reads
160+
//check if the number of times the position occurs is the same
161+
uint32_t count1 = std::count(positions1.begin(), positions1.end(), pos);
162+
uint32_t count2 = std::count(positions2.begin(), positions2.end(), pos);
163+
if(count1 != count2) continue;
164+
165+
std::vector<uint32_t> tmp_pos2;
166+
std::vector<std::string> tmp_base2;
167+
std::vector<uint32_t> tmp_qual2;
168+
//go get all the second read info
169+
auto sit = positions2.begin();
170+
while ((sit = std::find(sit, positions2.end(), pos)) != positions2.end()) {
171+
uint32_t index2 = std::distance(positions2.begin(), sit);
172+
tmp_pos2.push_back(positions2[index2]);
173+
tmp_qual2.push_back(qualities2[index2]);
174+
tmp_base2.push_back(bases2[index2]);
175+
++sit;
176+
}
177+
std::vector<uint32_t> tmp_pos1;
178+
std::vector<std::string> tmp_base1;
179+
std::vector<uint32_t> tmp_qual1;
180+
//go get all the first read info
181+
auto fit = positions1.begin();
182+
while ((fit = std::find(fit, positions1.end(), pos)) != positions1.end()) {
183+
uint32_t index1 = std::distance(positions1.begin(), fit);
184+
tmp_pos1.push_back(positions1[index1]);
185+
tmp_qual1.push_back(qualities1[index1]);
186+
tmp_base1.push_back(bases1[index1]);
187+
++fit;
188+
}
189+
//now do the base and quality comparison
190+
bool use= true;
191+
for(uint32_t j=0; j < tmp_pos1.size(); j++){
192+
if(tmp_base1[j] != tmp_base2[j]){
193+
use = false;
194+
break;
195+
}
196+
}
197+
if(use) {
198+
for(uint32_t j=0; j < tmp_pos1.size(); j++){
199+
final_positions.push_back(tmp_pos1[j]);
200+
final_qualities.push_back(tmp_qual1[j]);
201+
final_bases.push_back(tmp_base1[j]);
181202
}
182203
}
183204
}
184-
}*/
185-
186-
//remove positions, bases, and qualitieis where it is not the same
187-
/*std::sort(removal.rbegin(), removal.rend());
188-
for (uint32_t idx : removal) {
189-
positions.erase(positions.begin() + idx);
190-
bases.erase(bases.begin() + idx);
191-
qualities.erase(qualities.begin() + idx);
192-
}*/
205+
}
206+
207+
//TESTLINES
208+
/*
209+
for(uint32_t i=0; i < final_positions.size(); i++){
210+
std::cerr << final_positions[i] << " " << final_qualities[i] << " " << final_bases[i] << std::endl;
211+
}
212+
exit(0);*/
213+
214+
193215
//find assigned amplicon and populate position vector
194-
amplicons.find_read_amplicon(start_forward, end_reverse, positions1, bases1, qualities1);
216+
bool found_amplicon = false;
217+
amplicons.find_read_amplicon(start_forward, end_reverse, final_positions, final_bases, final_qualities, found_amplicon);
218+
if(!found_amplicon){
219+
amplicons.add_read_variants(final_positions, final_bases, final_qualities);
220+
std::cerr << "back " << found_amplicon << std::endl;
221+
}
195222
}
196223

197224
void generate_range(uint32_t start, uint32_t end, std::vector<uint32_t> &result) {
@@ -356,7 +383,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
356383
while (sam_read1(in, header, aln) >= 0) {
357384
//get the name of the read
358385
std::string read_name = bam_get_qname(aln);
359-
if(read_name != "A01535:8:HJ3YYDSX2:4:1103:6840:26052") continue;
386+
//if(read_name != "A01535:8:HJ3YYDSX2:4:1103:6840:26052") continue;
360387
std::cerr << read_name << std::endl;
361388
//std::cerr << read_name << std::endl;
362389
strand = '+';
@@ -367,7 +394,7 @@ int preprocess_reads(std::string bam, std::string bed, std::string bam_out,
367394
start_pos = aln->core.pos;
368395
}
369396
//TEST LINES
370-
if(start_pos < 13000 || start_pos > 15000) continue;
397+
//if(start_pos < 13000 || start_pos > 15000) continue;
371398
bam1_t *r = aln;
372399
//get the md tag
373400
uint8_t *aux = bam_aux_get(aln, "MD");

0 commit comments

Comments
 (0)