Skip to content

Commit 95afdcc

Browse files
committed
Re-implement extended SFS clustering without interval trees.
1 parent ccbe129 commit 95afdcc

9 files changed

+323
-280
lines changed

caller.cpp

+32-63
Original file line numberDiff line numberDiff line change
@@ -2,68 +2,37 @@
22

33
void Caller::run() {
44
config = Configuration::getInstance();
5-
6-
load_chromosomes(config->reference);
7-
cout << "Loaded all chromosomes" << endl ;
5+
lprint({"PingPong SV Caller running on", to_string(config->threads), "threads.."}) ;
6+
// load reference genome and SFS
7+
load_chromosomes(config->reference) ;
8+
lprint({"Loaded all chromosomes."}) ;
89
load_input_sfs() ;
9-
10-
ovcf.open(config->workdir + "/svs_poa.vcf");
10+
// call SVs from extended SFS
11+
vector<SV> svs ;
12+
Extender extender = Extender(&SFSs) ;
13+
extender.run(config->threads) ;
14+
svs.insert(svs.begin(), extender.svs.begin(), extender.svs.end()) ;
15+
lprint({"Predicted", to_string(svs.size()), "SVs from extended SFS."}) ;
16+
interval_tree_t<int> vartree ;
17+
for (const auto& sv: svs) {
18+
vartree.insert({sv.s - 1000, sv.e + 1000}) ;
19+
}
20+
// call SVS from clipped SFS
21+
Clipper clipper(extender.clips);
22+
clipper.call(config->threads, vartree) ;
23+
int s = 0 ;
24+
for (int i = 0; i < config->threads; i++) {
25+
s += clipper._p_svs[i].size() ;
26+
svs.insert(svs.begin(), clipper._p_svs[i].begin(), clipper._p_svs[i].end()) ;
27+
}
28+
lprint({"Predicted", to_string(s), "SVs from extended SFS."}) ;
29+
std::sort(svs.begin(), svs.end()) ;
30+
// output POA alignments SAM
1131
osam.open(config->workdir + "/poa.sam");
12-
// --- SAM header
1332
osam << "@HD\tVN:1.4" << endl;
1433
for (int i = 0; i < chromosomes.size(); ++i) {
1534
osam << "@SQ\tSN:" << chromosomes[i] << "\t" << "LN:" << strlen(chromosome_seqs[chromosomes[i]]) << endl ;
1635
}
17-
// --- VCF header
18-
print_vcf_header() ;
19-
20-
//vector<vector<Consensus>> alignments(config->threads) ; // produce a SAM file of consensus alignments
21-
//vector<vector<SV>> svs(config->threads) ; // produce a SAM file of consensus alignments
22-
//#pragma omp parallel for num_threads(config->threads) schedule(static,1)
23-
//for(int i = 0; i < chromosomes.size(); i++) {
24-
// string chrom = chromosomes[i] ;
25-
// int t = i % config->threads ;
26-
// cout << "Processing chromosome " << chrom << ".. " << endl ;
27-
28-
// Extender extender = Extender(chrom, &SFSs) ;
29-
// extender.run(4) ;
30-
// alignments[t].insert(alignments[t].begin(), extender.alignments.begin(), extender.alignments.end()) ;
31-
// svs[t].insert(svs[t].begin(), extender.svs.begin(), extender.svs.end()) ;
32-
33-
// cout << svs[t].size() << " SVs." << endl ;
34-
35-
// //Clipper clipper(chrom, extender.clips);
36-
// //clipper.call(reference[chrom], sv_tree);
37-
// //svs[omp_get_thread_num()].insert(svs[omp_get_thread_num()].begin(), clipper.svs.begin(), clipper.svs.end()) ;
38-
//}
39-
//for (int i = 0; i < config->threads; i++) {
40-
// for (int j = 0; j < alignments[i].size(); j++) {
41-
// const auto& c = alignments[i][j] ;
42-
// osam << c.chrom << ":" << c.s + 1 << "-" << c.e + 1 << "\t"
43-
// << "0"
44-
// << "\t" << c.chrom << "\t" << c.s + 1 << "\t"
45-
// << "60"
46-
// << "\t" << c.cigar << "\t"
47-
// << "*"
48-
// << "\t"
49-
// << "0"
50-
// << "\t"
51-
// << "0"
52-
// << "\t" << c.seq << "\t"
53-
// << "*" << endl ;
54-
// }
55-
// for (const SV& sv: svs[i]) {
56-
// ovcf << sv << endl ;
57-
// }
58-
//}
59-
60-
61-
Extender extender = Extender(&SFSs) ;
62-
extender.run(config->threads) ;
63-
//Clipper clipper(chrom, extender.clips);
64-
//clipper.call(reference[chrom], sv_tree);
65-
//svs[omp_get_thread_num()].insert(svs[omp_get_thread_num()].begin(), clipper.svs.begin(), clipper.svs.end()) ;
66-
6736
for (int j = 0; j < extender.alignments.size(); j++) {
6837
const auto& c = extender.alignments[j] ;
6938
osam << c.chrom << ":" << c.s + 1 << "-" << c.e + 1 << "\t"
@@ -79,13 +48,13 @@ void Caller::run() {
7948
<< "\t" << c.seq << "\t"
8049
<< "*" << endl ;
8150
}
82-
83-
std::sort(extender.svs.begin(), extender.svs.end()) ;
51+
osam.close() ;
52+
// output SV calls
53+
ovcf.open(config->workdir + "/svs_poa.vcf");
54+
print_vcf_header() ;
8455
for (const SV& sv: extender.svs) {
8556
ovcf << sv << endl ;
8657
}
87-
88-
osam.close() ;
8958
ovcf.close() ;
9059
}
9160

@@ -94,12 +63,12 @@ void Caller::load_input_sfs() {
9463
int num_batches = config->aggregate_batches ;
9564
int num_threads = num_batches < threads ? num_batches : threads ;
9665
vector<unordered_map<string, vector<SFS>>> _SFSs(num_batches) ;
97-
cout << "Loading assmbled SFS.." << endl ;
66+
lprint({"Loading assmbled SFS.."}) ;
9867
#pragma omp parallel for num_threads(num_threads)
9968
for (int j = 0; j < num_batches; j++) {
10069
string s_j = std::to_string(j) ;
10170
string inpath = config->workdir + "/solution_batch_" + s_j + ".assembled.sfs" ;
102-
cout << "[I] Loading SFS from " << inpath << endl ;
71+
//cout << "[I] Loading SFS from " << inpath << endl ;
10372
ifstream inf(inpath) ;
10473
string line ;
10574
if (inf.is_open()) {
@@ -122,7 +91,7 @@ void Caller::load_input_sfs() {
12291
int r = 0 ;
12392
int c = 0 ;
12493
for (int j = 0; j < num_batches; j++) {
125-
lprint({"Batch", to_string(j), "with", to_string(_SFSs[j].size()), "strings."});
94+
//lprint({"Batch", to_string(j), "with", to_string(_SFSs[j].size()), "strings."});
12695
r += _SFSs[j].size() ;
12796
SFSs.insert(_SFSs[j].begin(), _SFSs[j].end()) ;
12897
for (auto& read: _SFSs[j]) {

caller.hpp

+2-11
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,12 @@ class Caller {
2222

2323
private:
2424
Configuration* config;
25+
std::unordered_map<std::string, std::vector<SFS>> SFSs ;
2526

2627
ofstream ovcf;
2728
ofstream osam;
28-
29-
samFile *sfs_bam;
30-
bam_hdr_t *sfs_bamhdr;
31-
hts_idx_t *sfs_bamindex;
32-
33-
samFile *read_bam;
34-
bam_hdr_t *read_bamhdr;
35-
hts_idx_t *read_bamindex;
36-
37-
std::unordered_map<std::string, std::vector<SFS>> SFSs ;
38-
void print_vcf_header() ;
3929
void load_input_sfs() ;
30+
void print_vcf_header() ;
4031
};
4132

4233
#endif

chromosomes.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ int get_reference_size(ifstream &fasta_file) {
3838
}
3939

4040
void load_chromosomes(string path) {
41-
lprint({"Loading reference genome from", path, ".."});
41+
lprint({"Loading reference genome from", path + ".."});
4242
gzFile fp = gzopen(path.c_str(), "r");
4343
kseq_t *seq = kseq_init(fp);
4444
int l;
4545
while ((l = kseq_read(seq)) >= 0) {
4646
string name(seq->name.s) ;
4747
if (name.find('_') == -1) {
48-
lprint({"Extracted", seq->name.s, "with", to_string(l), "bases"});
48+
lprint({"Extracted", seq->name.s, "with", to_string(l), "bases."});
4949
for (uint i = 0; i < l; i++) {
5050
seq->seq.s[i] = toupper(seq->seq.s[i]) ;
5151
}

0 commit comments

Comments
 (0)