Skip to content

Commit 91a4699

Browse files
committed
Use ksw2 instead of parasail + Remove genotyper
1 parent 723790e commit 91a4699

File tree

6 files changed

+81
-87
lines changed

6 files changed

+81
-87
lines changed

CMakeLists.txt

+18-20
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ if(DEFINED CONDAPREFIX)
1717
endif()
1818

1919
project(SVDSS VERSION 2.0.0)
20-
add_executable(SVDSS assembler.cpp bam.cpp caller.cpp chromosomes.cpp clipper.cpp clusterer.cpp config.cpp ping_pong.cpp sfs.cpp smoother.cpp sv.cpp genotyper.cpp main.cpp)
20+
add_executable(SVDSS assembler.cpp bam.cpp caller.cpp chromosomes.cpp clipper.cpp clusterer.cpp config.cpp ping_pong.cpp sfs.cpp smoother.cpp sv.cpp main.cpp)
2121
set_target_properties(SVDSS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}")
2222
target_compile_options(SVDSS PRIVATE -Wall -Wextra)
2323

@@ -107,22 +107,19 @@ SET(ABPOA_INCLUDE_DIR ${ABPOA_SOURCE_DIR}/include)
107107
# add_library(ABPOA STATIC IMPORTED)
108108
# set_target_properties(ABPOA PROPERTIES IMPORTED_LOCATION ${ABPOA_SOURCE_DIR}/lib/libabpoa.a)
109109

110-
# parasail
111-
###########
112-
message(STATUS "parasail will be built from source")
113-
ExternalProject_Add(parasail
114-
GIT_REPOSITORY https://github.com/jeffdaily/parasail.git
115-
GIT_TAG 600fb26151ff19899ee39a214972dcf2b9b11ed7 # v2.6.2
116-
UPDATE_COMMAND ""
117-
BUILD_IN_SOURCE 1
118-
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
119-
INSTALL_COMMAND ""
120-
)
121-
ExternalProject_Get_Property(parasail SOURCE_DIR)
122-
SET(PARASAIL_SOURCE_DIR ${SOURCE_DIR})
123-
SET(PARASAIL_INCLUDE_DIR ${PARASAIL_SOURCE_DIR})
124-
add_library(PARASAIL STATIC IMPORTED)
125-
set_target_properties(PARASAIL PROPERTIES IMPORTED_LOCATION ${PARASAIL_SOURCE_DIR}/libparasail.a)
110+
# ksw2
111+
message(STATUS "ksw2 will be built from source")
112+
ExternalProject_Add(ksw2
113+
GIT_REPOSITORY https://github.com/lh3/ksw2.git
114+
GIT_TAG ""
115+
BUILD_IN_SOURCE 1
116+
UPDATE_COMMAND ""
117+
CONFIGURE_COMMAND ""
118+
BUILD_COMMAND make
119+
INSTALL_COMMAND ""
120+
)
121+
ExternalProject_Get_Property(ksw2 SOURCE_DIR)
122+
SET(KSW2_INCLUDE_DIR ${SOURCE_DIR})
126123

127124
# rapidfuzz-cpp
128125
################
@@ -182,7 +179,7 @@ FetchContent_MakeAvailable(spdlog)
182179
### final setup (includes and libraries) ###
183180
############################################
184181

185-
add_dependencies(SVDSS abpoa parasail rapidfuzz intervaltree ropebwt spdlog)
182+
add_dependencies(SVDSS abpoa rapidfuzz intervaltree ropebwt spdlog ksw2)
186183
if(NOT DEFINED CONDAPREFIX)
187184
# we need to build them
188185
add_dependencies(SVDSS deflate htslib)
@@ -191,7 +188,7 @@ endif()
191188
target_include_directories(SVDSS
192189
PRIVATE ${HTS_INCLUDE_DIR}
193190
PRIVATE ${ABPOA_INCLUDE_DIR}
194-
PRIVATE ${PARASAIL_INCLUDE_DIR}
191+
PRIVATE ${KSW2_INCLUDE_DIR}
195192
PRIVATE ${RAPIDFUZZ_INCLUDE_DIR}
196193
PRIVATE ${INTERVALTREE_INCLUDE_DIR}
197194
PRIVATE ${ROPEBWT_INCLUDE_DIR}
@@ -220,7 +217,8 @@ endif()
220217
target_link_libraries(SVDSS
221218
PUBLIC rapidfuzz::rapidfuzz
222219
PUBLIC ${BINARY_DIR}/lib/libabpoa.a
223-
PUBLIC PARASAIL
220+
PUBLIC ${KSW2_INCLUDE_DIR}/ksw2_extz2_sse.o
221+
PUBLIC ${KSW2_INCLUDE_DIR}/ksw2_extd2_sse.o
224222
PUBLIC ROPEBWT
225223
PUBLIC spdlog::spdlog
226224
PUBLIC z

bam.hpp

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef BAM_HPP
22
#define BAM_HPP
33

4+
#include <cstring>
45
#include <ctype.h>
56
#include <iostream>
67
#include <iterator>
@@ -47,21 +48,24 @@ struct CIGAR {
4748
}
4849
}
4950

50-
void parse_cigar(char *cigar) {
51+
void parse_cigar(const char *cigar) {
5152
int b = 0;
5253
int lc = strlen(cigar);
54+
char *cigar_cpy = (char *)malloc(lc + 1);
55+
strncpy(cigar_cpy, cigar, lc + 1);
5356
for (int i = 0; i < lc; i++) {
54-
if (isdigit(cigar[i]))
57+
if (isdigit(cigar_cpy[i]))
5558
continue;
5659
else {
57-
char type = cigar[i];
58-
cigar[i] = '\0';
59-
int l = stoi(string(cigar + b));
60+
char type = cigar_cpy[i];
61+
cigar_cpy[i] = '\0';
62+
int l = stoi(string(cigar_cpy + b));
6063
ops.push_back(make_pair(l, type));
61-
cigar[i] = type;
64+
cigar_cpy[i] = type;
6265
b = i + 1;
6366
}
6467
}
68+
free(cigar_cpy);
6569
}
6670

6771
CIGAR(char *cigar, int score_, int start_ = 0) {

caller.cpp

+41-40
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ void Caller::run() {
5252
for (const SV &sv : clipped_svs)
5353
cout << sv << endl;
5454
}
55+
56+
destroy_chromosomes();
5557
}
5658

5759
void Caller::write_vcf() {
@@ -94,7 +96,7 @@ vector<Cluster> Caller::split_cluster_by_len(const Cluster &cluster) {
9496
return subclusters;
9597
}
9698

97-
/* Split cluster in subclusters */
99+
// Split cluster in subclusters
98100
vector<Cluster> Caller::split_cluster(const Cluster &cluster) {
99101
// Step 1: split cluster by haplotype tag
100102
Cluster cluster_0 = cluster;
@@ -256,13 +258,16 @@ string Caller::run_poa(const vector<string> &seqs) {
256258
uint n_seqs = seqs.size();
257259
abpoa_t *ab = abpoa_init();
258260
abpoa_para_t *abpt = abpoa_init_para();
259-
abpt->disable_seeding = 1;
260261
abpt->align_mode = 0; // global
262+
abpt->disable_seeding = 1;
263+
abpt->progressive_poa = 0;
264+
abpt->amb_strand = 0;
261265
abpt->out_msa = 0;
262266
abpt->out_cons = 1;
263267
abpt->out_gfa = 0;
264268
// abpt->is_diploid = 1; // TODO: maybe this works now
265-
abpt->progressive_poa = 0;
269+
// abpt->max_n_cons = 2;
270+
// abpt->min_freq = 0.25;
266271
abpoa_post_set_para(abpt);
267272

268273
// abpt->match = 2; // match score
@@ -285,7 +290,8 @@ string Caller::run_poa(const vector<string> &seqs) {
285290

286291
abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, NULL);
287292
abpoa_cons_t *abc = ab->abc;
288-
string cons = "";
293+
string cons = ""; // XXX: we may avoid converting to ACGT here since we need
294+
// to reconvert back for ksw2
289295
if (abc->n_cons > 0)
290296
for (int j = 0; j < abc->cons_len[0]; ++j)
291297
cons += "ACGTN"[abc->cons_base[0][j]];
@@ -301,10 +307,8 @@ string Caller::run_poa(const vector<string> &seqs) {
301307
return cons;
302308
}
303309

304-
/* Call SVs by POA+realignment */
310+
// Call SVs by POA+realignment
305311
void Caller::pcall(const vector<Cluster> &clusters) {
306-
// vector<Genotyper> genotypers(config->threads);
307-
vector<string> gt_strings = {"0/0", "0/1", "1/0", "1/1"};
308312
#pragma omp parallel for num_threads(config->threads) schedule(static, 1)
309313
for (size_t i = 0; i < clusters.size(); i++) {
310314
int t = omp_get_thread_num();
@@ -313,24 +317,6 @@ void Caller::pcall(const vector<Cluster> &clusters) {
313317
continue;
314318
string chrom = cluster.chrom;
315319

316-
// genotypers.at(t).posterior_sv_genotype_give_reads(cluster.reads);
317-
// vector<double> gts = genotypers.at(t).get_posterior_sv_genotype();
318-
Genotyper gtyper;
319-
gtyper.posterior_sv_genotype_give_reads(cluster.reads);
320-
vector<double> gts = gtyper.get_posterior_sv_genotype();
321-
auto max_gt = max_element(gts.begin(), gts.end());
322-
double gtq = *max_gt;
323-
if (gtq < 0 || gtq > 1) {
324-
cerr << chrom << ":" << cluster.s << "-" << cluster.e << endl;
325-
for (const auto &tpl : cluster.reads)
326-
cerr << get<0>(tpl) << ":" << get<1>(tpl) << " ";
327-
cerr << endl;
328-
for (int i = 0; i < 4; ++i)
329-
cerr << gt_strings[i] << " - " << gts[i] << endl;
330-
}
331-
string gt = gt_strings[distance(gts.begin(), max_gt)];
332-
if (config->noref && gt.compare("0/0") == 0)
333-
continue;
334320
const vector<Cluster> &subclusters = split_cluster(cluster);
335321

336322
// Calling from one or two clusters
@@ -342,26 +328,41 @@ void Caller::pcall(const vector<Cluster> &clusters) {
342328

343329
string ref = string(chromosome_seqs[chrom] + cl.s, cl.e - cl.s + 1);
344330
string consensus = run_poa(cl.get_seqs());
345-
parasail_result_t *result = NULL;
346-
result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(),
347-
ref.c_str(), ref.size(), 10, 1,
348-
&parasail_nuc44);
349-
parasail_cigar_t *pcigar =
350-
parasail_result_get_cigar(result, consensus.c_str(), consensus.size(),
351-
ref.c_str(), ref.size(), NULL);
352-
char *cigar_str = parasail_cigar_decode(pcigar);
353-
int score = result->score;
354-
parasail_cigar_free(pcigar);
355-
parasail_result_free(result);
331+
332+
// ksw2 stuff - TODO: move to a separate function
333+
int sc_mch = 1, sc_mis = -9, gapo = 16, gape = 2, gapo2 = 41, gape2 = 1;
334+
int8_t a = (int8_t)sc_mch,
335+
b = sc_mis < 0 ? (int8_t)sc_mis : -(int8_t)sc_mis; // a>0 and b<0
336+
int8_t mat[25] = {a, b, b, b, 0, b, a, b, b, 0, b, b, a,
337+
b, 0, b, b, b, a, 0, 0, 0, 0, 0, 0};
338+
uint tl = ref.size(), ql = consensus.size();
339+
uint8_t *ts = (uint8_t *)malloc(tl);
340+
uint8_t *qs = (uint8_t *)malloc(ql);
341+
for (i = 0; i < tl; ++i)
342+
ts[i] = _char26_table[(uint8_t)ref[i]]; // encode to 0/1/2/3
343+
for (i = 0; i < ql; ++i)
344+
qs[i] = _char26_table[(uint8_t)consensus[i]];
345+
346+
ksw_extz_t ez;
347+
memset(&ez, 0, sizeof(ksw_extz_t));
348+
ksw_extd2_sse(0, ql, qs, tl, ts, 5, mat, gapo, gape, gapo2, gape2, -1, -1,
349+
-1, 0, &ez);
350+
351+
int score = ez.score;
352+
string cigar_str = "";
353+
for (int i = 0; i < ez.n_cigar; ++i) {
354+
cigar_str += to_string(ez.cigar[i] >> 4) + "MID"[ez.cigar[i] & 0xf];
355+
}
356356
_p_alignments[t].push_back(
357357
Consensus(consensus, cigar_str, chrom, cl.s, cl.e));
358+
358359
// -- Extracting SVs
359360
uint rpos = cl.s; // position on reference
360361
uint cpos = 0; // position on consensus
361362
CIGAR cigar;
362-
cigar.parse_cigar(cigar_str);
363+
cigar.parse_cigar(cigar_str.c_str());
363364
int nv = 0;
364-
for (const auto cigar_pair : cigar.ops) {
365+
for (const auto &cigar_pair : cigar.ops) {
365366
uint l = cigar_pair.first;
366367
char op = cigar_pair.second;
367368
if (op == '=' || op == 'M') {
@@ -394,7 +395,7 @@ void Caller::pcall(const vector<Cluster> &clusters) {
394395
}
395396
for (size_t v = 0; v < _svs.size(); v++) {
396397
_svs[v].ngaps = nv;
397-
_svs[v].set_gt(gt, max(0, (int)(gtq * 100)));
398+
_svs[v].set_gt("./.", 100);
398399
_svs[v].set_cov(cl.cov, cl.cov0, cl.cov1, cl.cov2);
399400
_svs[v].set_rvec(cluster.reads);
400401
}
@@ -404,7 +405,7 @@ void Caller::pcall(const vector<Cluster> &clusters) {
404405
}
405406
}
406407

407-
/* Clean same SV reported twice */
408+
// Clean same SV reported twice
408409
void Caller::clean_dups() {
409410
vector<SV> _svs;
410411
string last_chrom = "";

caller.hpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,14 @@
77
#include <map>
88

99
#include <abpoa.h>
10-
#include <parasail.h>
11-
#include <parasail/matrices/nuc44.h>
10+
#include <ksw2.h>
1211
#include <rapidfuzz/fuzz.hpp>
1312
#include <spdlog/spdlog.h>
1413

1514
#include "bam.hpp"
1615
#include "chromosomes.hpp"
1716
#include "clipper.hpp"
1817
#include "clusterer.hpp"
19-
#include "genotyper.hpp"
2018
#include "config.hpp"
2119
#include "sfs.hpp"
2220
#include "sv.hpp"

clipper.cpp

+10-17
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ vector<Clip> Clipper::combine(const vector<Clip> &clips) {
2525
}
2626
// we then merge
2727
#pragma omp parallel for num_threads(threads) schedule(static, 1)
28-
for (int i = 0; i < chromosomes.size(); i++) {
28+
for (uint i = 0; i < chromosomes.size(); i++) {
2929
int t = i % threads;
3030
const string &chrom = chromosomes[i];
3131
for (auto it = clips_dict[chrom].begin(); it != clips_dict[chrom].end();
@@ -36,16 +36,14 @@ vector<Clip> Clipper::combine(const vector<Clip> &clips) {
3636
max_l = c.l;
3737
}
3838
}
39-
Clip clip = Clip("", chrom, it->first, max_l,
40-
it->second.front().starting,
39+
Clip clip = Clip("", chrom, it->first, max_l, it->second.front().starting,
4140
it->second.size());
4241
_p_combined_clips[t].push_back(clip);
4342
}
4443
}
4544
vector<Clip> combined_clips;
4645
for (int i = 0; i < threads; i++) {
47-
combined_clips.insert(combined_clips.begin(),
48-
_p_combined_clips[i].begin(),
46+
combined_clips.insert(combined_clips.begin(), _p_combined_clips[i].begin(),
4947
_p_combined_clips[i].end());
5048
}
5149
return combined_clips;
@@ -93,26 +91,20 @@ vector<Clip> Clipper::filter_tooclose_clips(const vector<Clip> &clips,
9391
interval_tree_t<int> &vartree) {
9492
vector<Clip> fclips;
9593
for (const Clip &c : clips) {
96-
if (vartree.overlap_find({c.p, c.p + 1}) == end(vartree)) {
94+
if (vartree.overlap_find({(int)c.p, (int)c.p + 1}) == end(vartree)) {
9795
fclips.push_back(c);
9896
}
9997
}
10098
return fclips;
10199
}
102100

103101
// find smallest right that is larger than query
104-
int binary_search(const vector<Clip> &clips, int begin, int end,
102+
int binary_search(const vector<Clip> &clips, uint begin, uint end,
105103
const Clip &query) {
106-
// for (int i = 0; i < clips.size(); i++) {
107-
// if (query.p < clips[i].p) {
108-
// return i ;
109-
// }
110-
// }
111-
// return -1 ;
112104
if (begin > end || begin >= clips.size()) {
113105
return -1;
114106
}
115-
int m = (begin + end) / 2;
107+
uint m = (begin + end) / 2;
116108
if (clips[m].p == query.p) {
117109
if (m + 1 < clips.size()) {
118110
return m + 1;
@@ -130,7 +122,8 @@ int binary_search(const vector<Clip> &clips, int begin, int end,
130122
}
131123

132124
void Clipper::call(int threads, interval_tree_t<int> &vartree) {
133-
// lprint({"Predicting SVS from", to_string(clips.size()), "clipped SFS on", to_string(threads), "threads.."});
125+
// lprint({"Predicting SVS from", to_string(clips.size()), "clipped SFS on",
126+
// to_string(threads), "threads.."});
134127
vector<Clip> rclips;
135128
vector<Clip> lclips;
136129
for (const Clip &clip : clips) {
@@ -169,7 +162,7 @@ void Clipper::call(int threads, interval_tree_t<int> &vartree) {
169162
}
170163
// lprint({"Predicting insertions.."});
171164
#pragma omp parallel for num_threads(threads) schedule(static, 1)
172-
for (int i = 0; i < lclips.size(); i++) {
165+
for (uint i = 0; i < lclips.size(); i++) {
173166
const Clip &lc = lclips[i];
174167
int t = omp_get_thread_num();
175168
string chrom = lc.chrom;
@@ -194,7 +187,7 @@ void Clipper::call(int threads, interval_tree_t<int> &vartree) {
194187
}
195188
// lprint({"Predicting deletions.."});
196189
#pragma omp parallel for num_threads(threads) schedule(static, 1)
197-
for (int i = 0; i < rclips.size(); i++) {
190+
for (uint i = 0; i < rclips.size(); i++) {
198191
const Clip &rc = rclips[i];
199192
int t = omp_get_thread_num();
200193
string chrom = rc.chrom;

clusterer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ void Clusterer::extend_alignment(bam1_t *aln, int index) {
334334
merged_extended_sfs.push_back(local_extended_sfs.at(i));
335335
}
336336
}
337-
for (const auto mes : merged_extended_sfs)
337+
for (const auto &mes : merged_extended_sfs)
338338
_p_extended_sfs[index].push_back(mes);
339339

340340
if (lclip.second > 0)

0 commit comments

Comments
 (0)