Skip to content

Commit 0ddebe7

Browse files
committed
Fix a bug with SV calling from clipped SFS.
1 parent f1eddf3 commit 0ddebe7

5 files changed

+44
-44
lines changed

clipper.cpp

+19-26
Original file line numberDiff line numberDiff line change
@@ -98,41 +98,34 @@ vector<Clip> Clipper::filter_tooclose_clips(const vector<Clip> &clips, interval_
9898
return fclips;
9999
}
100100

101-
int binary_search_left(const vector<Clip>& clips, int begin, int end, const Clip& query) {
101+
// find smallest right that is larger than query
102+
int binary_search(const vector<Clip>& clips, int begin, int end, const Clip& query) {
103+
//for (int i = 0; i < clips.size(); i++) {
104+
// if (query.p < clips[i].p) {
105+
// return i ;
106+
// }
107+
//}
108+
//return -1 ;
102109
if (begin > end || begin >= clips.size()) {
103110
return -1 ;
104111
}
105112
int m = (begin + end) / 2 ;
106113
if (clips[m].p == query.p) {
107-
if (m != 0) {
108-
return m - 1 ;
109-
} else {
110-
return -1 ;
111-
}
112-
} else if (clips[m].p < query.p) {
113-
return binary_search_left(clips, m, end, query) ;
114-
} else {
115-
return binary_search_left(clips, begin, m - 1, query) ;
116-
}
117-
}
118-
119-
int binary_search_right(const vector<Clip>& clips, int begin, int end, const Clip& query) {
120-
if (begin > end || begin >= clips.size()) {
121-
return -1 ;
122-
}
123-
int m = (begin + end) / 2 ;
124-
if (clips[m].p == query.p) {
125-
if (m != 0) {
114+
if (m + 1 < clips.size()) {
126115
return m + 1 ;
127116
} else {
128-
return -1 ;
117+
return m ;
129118
}
130119
} else if (clips[m].p > query.p) {
131-
return binary_search_right(clips, begin, m, query) ;
120+
if (m > 0 && clips[m - 1].p < query.p) {
121+
return m ;
122+
}
123+
return binary_search(clips, begin, m - 1, query) ;
132124
} else {
133-
return binary_search_right(clips, m + 1, end, query) ;
125+
return binary_search(clips, m + 1, end, query) ;
134126
}
135127
}
128+
136129
void Clipper::call(int threads, interval_tree_t<int>& vartree) {
137130
lprint({"Predicting SVS from", to_string(clips.size()), "clipped SFS on", to_string(threads), "threads.."}) ;
138131
vector<Clip> rclips;
@@ -144,9 +137,9 @@ void Clipper::call(int threads, interval_tree_t<int>& vartree) {
144137
rclips.push_back(clip);
145138
}
146139
}
147-
lprint({"Preprocessing clipped SFS.."}) ;
148140
lprint({to_string(lclips.size()), "left clips."}) ;
149141
lprint({to_string(rclips.size()), "right clips."}) ;
142+
lprint({"Preprocessing clipped SFS.."}) ;
150143
#pragma omp parallel for num_threads(2) schedule(static,1)
151144
for (int i = 0; i < 2; i++) {
152145
if (i == 0) {
@@ -178,7 +171,7 @@ void Clipper::call(int threads, interval_tree_t<int>& vartree) {
178171
int t = omp_get_thread_num() ;
179172
string chrom = lc.chrom ;
180173
// we get the closest right clip
181-
int r = binary_search_right(rclips, 0, rclips.size() - 1, lc) ;
174+
int r = binary_search(rclips, 0, rclips.size() - 1, lc) ;
182175
if (r == -1) {
183176
continue ;
184177
}
@@ -202,7 +195,7 @@ void Clipper::call(int threads, interval_tree_t<int>& vartree) {
202195
int t = omp_get_thread_num() ;
203196
string chrom = rc.chrom ;
204197
// we get the closest right clip
205-
int l = binary_search_left(lclips, 0, lclips.size() - 1, rc) ;
198+
int l = binary_search(lclips, 0, lclips.size() - 1, rc) ;
206199
if (l == -1) {
207200
continue ;
208201
}

extender.cpp

+18-13
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ void Extender::run(int _threads) {
2828
}
2929
clips.insert(clips.begin(), _p_clips[i].begin(), _p_clips[i].end()) ;
3030
}
31-
lprint({"Extension complete."}) ;
31+
lprint({"Extension complete.", to_string(clips.size()), " clipped SFS."}) ;
3232
// build a separate interval tree for each chromosome
3333
cluster_no_interval_tree() ;
3434
// put all clusters in a single vector
@@ -60,6 +60,7 @@ void Extender::run(int _threads) {
6060
lprint({"Extracted", to_string(svs.size()), "SVs."});
6161
sam_close(bam_file) ;
6262
filter_sv_chains() ;
63+
//lprint({"Error stats: ", to_string(skip_1), " can't be mapped, ", to_string(skip_2), " can't be extended ", to_string(skip_3), " anomalies.", to_string(skip_4), " clipped."}) ;
6364
}
6465

6566
pair<int, int> Extender::get_unique_kmers(const vector<pair<int, int>> &alpairs, const uint k, const bool from_end, string chrom) {
@@ -207,9 +208,6 @@ bool Extender::load_batch_bam(int threads, int batch_size, int p) {
207208
if (aln->core.flag & BAM_FUNMAP || aln->core.flag & BAM_FSUPPLEMENTARY || aln->core.flag & BAM_FSECONDARY) {
208209
continue ;
209210
}
210-
if (aln->core.qual != 0) {
211-
continue ;
212-
}
213211
char *qname = bam_get_qname(aln);
214212
if (SFSs->find(qname) == SFSs->end()) {
215213
continue ;
@@ -298,6 +296,7 @@ void Extender::extend_alignment(bam1_t* aln, int index) {
298296
uint l = bam_cigar_oplen(*(cigar + 0));
299297
if (op == BAM_CSOFT_CLIP) {
300298
lclip = make_pair(aln->core.pos, l);
299+
skip_4++ ;
301300
}
302301
// we skip this SFS
303302
continue ;
@@ -306,6 +305,7 @@ void Extender::extend_alignment(bam1_t* aln, int index) {
306305
uint l = bam_cigar_oplen(*(cigar + aln->core.n_cigar - 1));
307306
if (op == BAM_CSOFT_CLIP) {
308307
rclip = make_pair(bam_endpos(aln), l);
308+
skip_4++ ;
309309
}
310310
// we skip this SFS
311311
continue ;
@@ -382,12 +382,12 @@ void Extender::extend_alignment(bam1_t* aln, int index) {
382382
} else {
383383
_p_extended_sfs[index].push_back(ExtSFS(string(chrom), string(qname), prekmer.second, postkmer.second + kmer_size));
384384
}
385-
if (lclip.second > 0) {
386-
_p_clips[index].push_back(Clip(qname, chrom, lclip.first, lclip.second, true)) ;
387-
}
388-
if (rclip.second > 0) {
389-
_p_clips[index].push_back(Clip(qname, chrom, rclip.first, rclip.second, false)) ;
390-
}
385+
}
386+
if (lclip.second > 0) {
387+
_p_clips[index].push_back(Clip(qname, chrom, lclip.first, lclip.second, true)) ;
388+
}
389+
if (rclip.second > 0) {
390+
_p_clips[index].push_back(Clip(qname, chrom, rclip.first, rclip.second, false)) ;
391391
}
392392
}
393393

@@ -607,7 +607,9 @@ void Extender::extract_sfs_sequences() {
607607
cerr << endl ;
608608
}
609609
for (int i = 0; i < threads; i++) {
610-
free(seq[i]) ;
610+
if (len[i] > 0) {
611+
free(seq[i]) ;
612+
}
611613
bam_destroy1(_p_aln[i]) ;
612614
sam_close(_p_bam_file[i]) ;
613615
}
@@ -784,6 +786,9 @@ void Extender::call() {
784786

785787
void Extender::filter_sv_chains() {
786788
std::sort(svs.begin(), svs.end()) ;
789+
if (svs.size() < 2) {
790+
return ;
791+
}
787792
lprint({to_string(svs.size()), "SVs before chain filtering."}) ;
788793
vector<SV> _svs ;
789794
auto& prev = svs[0] ;
@@ -796,7 +801,7 @@ void Extender::filter_sv_chains() {
796801
}
797802
auto& sv = svs[i] ;
798803
if (sv.chrom == prev.chrom && sv.s - prev.e < 2 * sv.l && prev.type == sv.type) {
799-
cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s << endl ;
804+
//cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s << endl ;
800805
// check for sequence similarity
801806
double w_r = max((double)sv.w, (double)prev.w) / min((double)sv.w, (double)prev.w) ;
802807
double l_r = min((double)sv.l, (double)prev.l) / max((double)sv.l, (double)prev.l) ;
@@ -807,7 +812,7 @@ void Extender::filter_sv_chains() {
807812
} else {
808813
d = rapidfuzz::fuzz::ratio(sv.altall, prev.altall) ;
809814
}
810-
cout << w_r << " " << l_r << " " << d << endl ;
815+
//cout << w_r << " " << l_r << " " << d << endl ;
811816
if (d > 70) {
812817
if (sv.w > prev.w) {
813818
_svs.push_back(sv) ;

extender.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class Extender {
3535
uint skip_1 = 0 ; // SFS skipped since no first/last base can be placed from read alignment (should be rare)
3636
uint skip_2 = 0 ; // SFS skipped since it couldn't be extended
3737
uint skip_3 = 0 ; // SFS skipped since reads starts/ends inside a cluster
38+
uint skip_4 = 0 ; // SFS skipped since reads starts/ends inside a cluster
3839
uint small_cl = 0 ; // number of cluster (before clustering) with low support
3940
uint extcl = 0 ; // number of extended clusters (after clustering)
4041
uint small_extcl = 0 ; // number of extended clusters (after clustering) with low support

ping_pong.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,6 @@ bool PingPong::backward_search(rld_t *index, const uint8_t *P, int p2) {
5757
// However non-reconstructed reads are going to produce loads of crappy SFS, unless we filter them
5858
void PingPong::ping_pong_search(rld_t *index, uint8_t* P, int l, std::vector<sfs_type_t>& solutions, bool isreconstructed, bam1_t* aln) {
5959
//DEBUG(cerr << "Read Length: " << l << endl ;)
60-
if (!isreconstructed) {
61-
return ;
62-
}
6360
// this is not needed anymore
6461
//int current_interval = -1 ;
6562
//vector<pair<int, int>> m_intervals ;
@@ -268,12 +265,16 @@ batch_type_t PingPong::process_batch(rld_t* index, int p, int i) {
268265
} else {
269266
for (int j = 0; j < read_seqs[p][i].size(); j++) {
270267
char *qname = bam_get_qname(bam_entries[p][i][j]) ;
268+
bool isreconstructed = reconstructed_reads.find(qname) != reconstructed_reads.end() ;
271269
if (config->putative) {
272270
if (ignored_reads.find(qname) != ignored_reads.end()) {
273271
continue ;
274272
}
273+
// was not ignored, so either it's reconstructed or not:
274+
if (!isreconstructed) {
275+
continue ;
276+
}
275277
}
276-
bool isreconstructed = reconstructed_reads.find(qname) != reconstructed_reads.end() ;
277278
//cout << qname << " " << isreconstructed << endl ;
278279
ping_pong_search(index, read_seqs[p][i][j], read_seq_lengths[p][i][j], solutions[qname], isreconstructed, bam_entries[p][i][j]) ;
279280
}

reconstructor.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ void Reconstructor::reconstruct_read(bam1_t* alignment, char* read_seq, string c
172172
new_cigar.push_back(cigar_offsets[m]) ;
173173
} else {
174174
cout << "Illegal Cigar OP" << endl ;
175-
break ;
175+
break ;
176176
//if (cigar_offsets[m].second == BAM_CPAD || cigar_offsets[m].second == BAM_CHARD_CLIP || cigar_offsets[m].second == BAM_CBACK) {
177177
}
178178
m += 1 ;

0 commit comments

Comments
 (0)