// Copyright 2013 Nadia Davidson for Murdoch Childrens Research
// Institute Australia. This program is distributed under the GNU
// General Public License. We also ask that you cite this software in
// publications where you made use of it for any part of the data
// analysis.
/** This file contains the main function for the corset program. It
** controls the I/O, including command line options.
**
** The program runs in four main stages:
** 1 - All the alignments in the bam files are read. See the function
** read_bam_file below and add_alignment() in ReadList for more
** detail.
** 2 - Filter out any transcripts which have fewer than a certain number
** of reads aligning. By default this is 10.
** 3 - The read-transcript associations are parsed and the transcripts
** undergo an initial clustering, see MakeClusters::makeSuperClusters(),
** in which transcripts are grouped together if they share a single read
** with another transcript in the same group.
** 4 - For each of the clusters above, hierarchical clustering is performed,
** see Cluster::cluster(). The group is split into multiple smaller
** clusters and counts are output for each of these smaller groups.
**
** Author: Nadia Davidson, [email protected]
** Modified: 25 June 2019
**/
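// Illustrative end-to-end invocation (a sketch assembled from the usage text
// in print_usage() below; the file and group names are hypothetical):
//   corset -g Group1,Group1,Group2,Group2 sample1.bam sample2.bam sample3.bam sample4.bam
// With the defaults this reads the four bam files (stage 1), filters transcripts
// with fewer than 10 aligned reads (stage 2), groups transcripts that share reads
// (stage 3) and writes counts.txt and clusters.txt after the hierarchical
// clustering at distance threshold 0.3 (stage 4).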
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <climits>
#include <map>
#include <random>
//#include <pthread.h>
#include <MakeClusters.h>
#include <Read.h>
#include <Transcript.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <signal.h>
#include <sam.h>
#include <bam.h>
//#include <gperftools/profiler.h>
#define MAX_BAM_LINE_LENGTH 10000
using namespace std;
const string corset_extension=".corset-reads";
// a function to parse a bam file. It requires samtools
// to read it. The alignments are stored in a ReadList object.
ReadList * read_bam_file(string all_file_names, TranscriptList * trans, int sample){
ReadList * rList = new ReadList(trans);
string filename;
stringstream ss(all_file_names);
//read each file in the comma separated list
while(getline(ss,filename,',')){
cout << "Reading bam file : "<< filename << endl;
if(filename.find(corset_extension)!=string::npos){
cout << "The input files look like corset read files (filenames with "<< corset_extension;
cout << "). Perhaps you meant to run with the -i corset option?" << endl;
exit(1);
}
samfile_t *in = 0 ;
if ((in = samopen(filename.c_str(), "br", NULL)) == 0 || in->header == 0) {
cerr << "fail to open "<< filename << " for reading." << endl;
exit(1);
}
bam1_t *b = bam_init1();
int r;
int i=0;
//read the header first... we only really need this when we want to
//output all the transcripts including those with no reads
for(int tid=0; tid < in->header->n_targets; tid++)
trans->insert(in->header->target_name[tid]);
while ((r = samread(in, b)) >= 0) { // read one alignment from `in'
//add the alignment into our data object
string read_name=string(bam1_qname(b));
int tid=b->core.tid;
if( tid != -1 ) //unmapped reads have tid=-1
rList->add_alignment(read_name, string(in->header->target_name[tid]),sample);
if(i % 200000 == 0) //output some info about how it's proceeding.
cout << float(i)/float(1000000) << " million alignments read" <<endl;
// if(i>5000000) break;
i++;
}
bam_destroy1(b);
samclose(in);
cout << "Done reading "<< filename << endl;
}
rList->compactify_reads(trans); //reduce the memory usage
return rList; //return the alignment list
}
// a function to parse a fasta file and generate k-mers for clustering
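// Sketch of the k-mer mode (inferred from the loop below): every contig is cut
// into overlapping k-mers of length k=70 and each k-mer is treated like a read
// aligning to that contig, so contigs sharing a k-mer become linked. For example,
// a 100 bp contig contributes 100 - 70 + 1 = 31 k-mers.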
ReadList * read_fasta_file(string all_file_names, TranscriptList * trans, int sample){
ReadList * rList = new ReadList(trans);
string filename;
stringstream ss(all_file_names);
ifstream file;
string line;
//read each file in the comma separated list
while(getline(ss,filename,',')){
cout << "Reading fasta file : "<< filename << endl;
//now open the fasta file
file.open(filename);
if(!(file.good())){
cout << "Unable to open file " << filename << endl;
exit(1);
}
map<string,string> sequences;
string id="";
while ( getline (file,line) ){
int start=line.find(">")+1;
if(start==1){ //if this is the ID line...
int end=line.find_first_of("\t\n ")-1;
id=line.substr(start,end);
} else {
sequences[id]=sequences[id]+line;
}
}
cout << "Done reading file, getting k-mers" << endl;
map<string,string>::iterator it=sequences.begin();
map<string,string>::iterator it_end=sequences.end();
int k=70;
int t_count=1;
for(;it!=it_end; it++, t_count++){
if(t_count % 1000 == 0) //output some info about how it's proceeding.
cout << float(t_count)/float(1000) << " thousand contigs read" <<endl;
string seq = it->second;
trans->insert(it->first);
for(int i=0; i <= (int)seq.length()-k ; i++){ //signed arithmetic so contigs shorter than k are skipped
string kmer=seq.substr(i,k) ;
rList->add_alignment(kmer,it->first,sample);
}
}
cout << "Done reading "<< filename << endl;
}
rList->compactify_reads(trans); //reduce the memory usage
return rList; //return the alignment list
}
#define READS_REDISTRIBUTED 0
#define READS_COUNTED 1
#define READS_FILTERED 2
/** Code to add an EC into the readList. This code is common to corset and
salmon EC file parsing **/
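// Worked example of the redistribution branch below: with
// Transcript::min_reads_for_link = 10, an equivalence class of weight 7 spanning
// 3 transcripts is too weak to form links, so its 7 reads are reassigned to the
// (shuffled) transcripts one at a time as 3 + 2 + 2 single-transcript alignments.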
int add_equivalence_class(ReadList * rList, int & sample, vector<string> & transNames, int & weight) {
//case where the number of reads is smaller than the threshold for a link between transcripts
if(weight<Transcript::min_reads_for_link){
shuffle(transNames.begin(),transNames.end(),default_random_engine());
for(int rr=0; rr < transNames.size() ; rr++){
int this_weight = weight/transNames.size();
if(rr < (weight%transNames.size()) )
this_weight++;
vector<string> this_tran;
this_tran.push_back(transNames.at(rr));
rList->add_alignment(this_tran,sample,this_weight);
}
return READS_REDISTRIBUTED;
}
else if(transNames.size()<=Transcript::max_alignments || Transcript::max_alignments<=0){
rList->add_alignment(transNames,sample,weight);
return READS_COUNTED;
}
return READS_FILTERED;
}
/** Process a corset format equivalence class file **/
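// Line format expected by the parser below (one equivalence class per line,
// whitespace separated):
//   <weight> <transcriptA> <transcriptB> ...
// where <weight> is the number of reads assigned to that set of transcripts.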
ReadList * read_corset_file(string all_file_names, TranscriptList * trans, int sample){
ReadList * rList = new ReadList(trans);
string filename;
stringstream ss(all_file_names);
//read each file in the comma separated list
while(getline(ss,filename,',')){
cout << "Reading corset file : "<< filename << endl;
//check that the filename has the correct file extension
//i.e. that we can't accidentally pass a bam file.
if(filename.find(corset_extension)==string::npos ){
cout << "The input files don't have the extension, "<< corset_extension;
cout << ". Please check them." << endl;
exit(1);
}
ifstream file(filename);
string line;
int reads_counted=0;
int reads_filtered=0;
int reads_redistributed=0;
while(getline(file, line)){
istringstream istream(line);
int weight;
vector<string> transNames;
string name;
int alignments=-1; //start at -1 because first column is weight
istream >> weight;
while(istream >> name)
transNames.push_back(name);
int ret_flag=add_equivalence_class(rList,sample,transNames,weight);
switch(ret_flag){
case READS_REDISTRIBUTED : reads_redistributed+=weight; break;
case READS_COUNTED : reads_counted+=weight; break;
case READS_FILTERED : reads_filtered+=weight; break;
}
}
cout<<reads_counted<< " reads counted, "
<<reads_filtered<<" reads filtered, "
<<reads_redistributed<<" reads redistributed."
<<endl;
}
return rList;
}
// Process a salmon equivalence class file
// This has the same information as a corset-reads file
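// Layout of the salmon eq_classes.txt file, as parsed below:
//   line 1       : number of transcripts, N
//   line 2       : number of equivalence classes, M
//   next N lines : one transcript name per line
//   next M lines : <class size n> <transcript index 1> ... <transcript index n> <read count>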
ReadList * read_salmon_eq_classes_file(string all_file_names, TranscriptList * trans, int sample){
ReadList * rList = new ReadList(trans);
string filename;
stringstream ss(all_file_names);
//read each file in the comma separated list
while(getline(ss,filename,',')){
cout << "Reading salmon eq_classes file : "<< filename << endl;
ifstream file(filename);
string line;
int reads_counted=0;
int reads_filtered=0;
int reads_redistributed=0;
//First read the header information with the transcript IDs
getline(file, line); int ntranscripts=atoi(line.c_str()) ;
getline(file, line); int neqclasses=atoi(line.c_str()) ;
cout << "Reading data on " << ntranscripts << " transcripts in " << neqclasses << " equivalence classes" << endl;
vector<string> trans_ids;
for(int nt=0; nt < ntranscripts ; nt++){
getline(file, line);
trans_ids.push_back(line);
}
//Now read the equivalence class information
for(int ne=0; ne < neqclasses ; ne++){
vector<string> transNames;
getline(file, line);
istringstream istream(line);
int eq_size; istream >> eq_size ;
for(int es=0; es < eq_size ; es++){
int trans_pos; istream >> trans_pos ;
transNames.push_back(trans_ids.at(trans_pos));
}
int weight;
istream >> weight;
int ret_flag=add_equivalence_class(rList,sample,transNames,weight);
switch(ret_flag){
case READS_REDISTRIBUTED : reads_redistributed+=weight; break;
case READS_COUNTED : reads_counted+=weight; break;
case READS_FILTERED : reads_filtered+=weight; break;
}
}
cout<<reads_counted<< " reads counted, "
<<reads_filtered<<" reads filtered, "
<<reads_redistributed<<" reads redistributed."
<<endl;
}
return rList;
}
// the help information which is printed when a user puts in the wrong
// combination of command line options.
void print_usage(){
cout << endl;
cout << "Corset clusters contigs and counts reads from de novo assembled transcriptomes." << endl;
cout << endl;
cout << "Usage: corset [options] <input bam files>" << endl;
cout << endl;
cout << "Input bam files:" << endl;
cout << "\t The input files should be multi-mapped bam files. They can be single, paired-end or mixed" << endl;
cout << "\t and do not need to be indexed. A space separated list should be given." << endl;
cout << "\t e.g. corset sample1.bam sample2.bam sample3.bam" << endl;
cout << "\t or just: corset sample*.bam" << endl;
cout << endl;
cout << "\t If you want to combine the results from different transcriptomes. i.e. the same reads have " << endl;
cout << "\t been mapped twice or more, you can used a comma separated list like below:" << endl;
cout << "\t corset sample1_Trinity.bam,sample1_Oases.bam sample2_Trinity.bam,sample2_Oases.bam ..." << endl;
cout << endl;
cout << "Options are:" << endl;
cout << endl;
cout << "\t -d <double list> A comma separated list of distance thresholds. The range must be" << endl;
cout << "\t between 0 and 1. e.g -d 0.4,0.5. If more than one distance threshold" << endl;
cout << "\t is supplied, the output filenames will be of the form:" << endl;
cout << "\t counts-<threshold>.txt and clusters-<threshold>.txt " << endl;
cout << "\t Default: 0.3" << endl;
cout << endl;
cout << "\t -D <double> The value used for thresholding the log likelihood ratio. The default " << endl;
cout << "\t value will depend on the number of degrees of freedom (which is the " << endl;
cout << "\t number of groups -1). By default D = 17.5 + 2.5 * ndf, which corresponds " << endl;
cout << "\t approximately to a p-value threshold of 10^-5, when there are fewer than" << endl;
cout << "\t 10 groups." << endl;
cout << endl;
cout << "\t -I Switches off the log likelihood ratio test and should be used " << endl;
cout << "\t for downstream differential transcript usage analysis. It will prevent " << endl;
cout << "\t differentially expressed transcript being split into different clusters. " << endl;
cout << "\t This option is the equivalent of setting -D to a very large number. " << endl;
cout << "\t Default: log likelihood ratio test on" << endl;
cout << endl;
cout << "\t -m <int> Filter out any transcripts with fewer than this many reads aligning." << endl;
cout << "\t Default: 10" << endl;
cout << endl;
cout << "\t -g <list> Specifies the grouping. i.e. which samples belong to which experimental" << endl;
cout << "\t groups. The parameter must be a comma separated list (no spaces), with the " << endl;
cout << "\t groupings given in the same order as the bam filename. For example:" << endl;
cout << "\t -g Group1,Group1,Group2,Group2 etc. If this option is not used, each sample" << endl;
cout << "\t is treated as an independent experimental group." << endl;
cout << endl;
cout << "\t -p <string> Prefix for the output filenames. The output files will be of the form" <<endl;
cout << "\t <prefix>-counts.txt and <prefix>-clusters.txt. Default filenames are:" << endl;
cout << "\t counts.txt and clusters.txt" << endl;
cout << endl;
cout << "\t -f <true/false> Specifies whether the output files should be overwritten if they already exist." << endl;
cout << "\t Default: false" << endl;
cout << endl;
cout << "\t -n <string list> Specifies the sample names to be used in the header of the output count file." << endl;
cout << "\t This should be a comma separated list without spaces." << endl;
cout << "\t e.g. -n Group1-ReplicateA,Group1-ReplicateB,Group2-ReplicateA etc." << endl;
cout << "\t Default: the input filenames will be used." << endl;
cout << endl;
cout << "\t -r <true/true-stop/false> " << endl;
cout << "\t Output a file summarising the read alignments. This may be used if you" << endl;
cout << "\t would like to read the bam files and run the clustering in seperate runs" << endl;
cout << "\t of corset. e.g. to read input bam files in parallel. The output will be the" << endl;
cout << "\t bam filename appended with "<< corset_extension <<"." << endl;
cout << "\t Default: false" << endl;
cout << endl;
cout << "\t -i <bam/corset/salmon_eq_classes> The input file type. Use -i corset, if you previously ran" << endl ;
cout << "\t corset with the -r option and would like to restart using those" << endl;
cout << "\t read summary files. Use salmon_eq_classes, if you aligned with salmon with" << endl;
cout << "\t the flag --dumpEq and are passing corset the equivalent class files." << endl;
cout << "\t Running with either -i corset or salmon_eq_classes will switch off the -r option." << endl;
cout << "\t Default: bam" << endl;
cout << endl;
cout << "\t -l <int> If running with -i corset or salmon_eq_classes, this will filter out a link between contigs" << endl;
cout << "\t if the link is supported by less than this many reads (performed sample-wise). Reads will " << endl;
cout << "\t be reassigned uniformly to the contigs in the equivalence class. This option will" << endl;
cout << "\t improve runtime and memory usage, but will increase the number of clusters reported." << endl;
cout << "\t Default: 1 (no filtering)" << endl;
cout << endl;
cout << "\t -x <int> If running with -i corset or salmon_eq_classes, this option will filter out reads that" << endl;
cout << "\t align to more than x contigs. Default: no filtering" << endl;
cout << endl;
cout << "Citation: Nadia M. Davidson and Alicia Oshlack, Corset: enabling differential gene expression " << endl;
cout << " analysis for de novo assembled transcriptomes, Genome Biology 2014, 15:410" << endl;
cout << endl;
cout << "Please see https://github.com/Oshlack/Corset/wiki for more information"<< endl;
cout << endl;
}
// the real stuff starts here.
int main(int argc, char **argv){
//default parameters:
string distance_string("0.3");
bool force=false;
string sample_names;
int c;
int params=1;
vector<int> groups;
bool output_reads=false;
bool stop_after_read=false;
string input_type="bam";
//function pointer to the method to read the bam or corset input files
ReadList * (*read_input)(string, TranscriptList *, int) = read_bam_file;
cout << endl;
cout << "Running Corset Version "<< VERSION << endl;
//parse the command line options
while((c = getopt(argc, argv, "f:p:d:n:g:D:Im:r:i:l:x:")) != EOF){
switch(c){
case 'f': { //f=force output to be overwritten?
std::string value(optarg);
transform(value.begin(), value.end(), value.begin(), ::tolower);
if( value.compare("true")==0 | value.compare("t")==0 | value.compare("1")==0 ){
force=true;
cout << "Setting output files to be overridden"<<endl;
}
else if (value.compare("false")==0 | value.compare("f")==0 | value.compare("0")==0 )
force=false;
else {
cerr << "Unknown argument passed with -f. Please specify true or false." << endl;
print_usage();
exit(1);
}
params+=2;
break; }
case 'p': //p=the prefix for the output files
cout << "Setting output filename prefix to " << optarg << endl;
Cluster::file_prefix=string(optarg)+"-";
params+=2;
break;
case 'd': //d=the distance threshold for the clustering
cout << "Setting distance threshold to: " << optarg << endl;
distance_string = optarg;
params+=2;
break;
case 'n':{ //names to give the samples in the output files
cout << "Setting sample names to:" << optarg << endl;
sample_names=string(optarg);
replace( sample_names.begin(), sample_names.end(), ',', '\t');
params+=2;
break;
}
case 'g':{ //the sample groups. Used for the likelihood ratio test
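// Worked example of the mapping below: -g A,A,B,B gives
// groups = {0,0,1,1} and Transcript::groups = 2.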
cout << "Setting sample groups:" << optarg;
stringstream ss(optarg);
string s;
vector < string > names;
while (getline(ss, s, ','))
names.push_back(s);
int ngroups=0;
for(int s1=0; s1 < names.size(); s1++){
//find the first occurrence
int s2=0;
for(; s2 < names.size(); s2++){
if(names.at(s1).compare(names.at(s2))==0)
break;
}
if(groups.size()<=s2){
groups.push_back(ngroups);
ngroups++;
}
else
groups.push_back(groups.at(s2));
}
Transcript::groups=ngroups;
cout << ", " << Transcript::groups << " groups in total" << endl;
params+=2;
break;
}
case 'D':{
cout << "Setting likelihood threshold at "<<optarg<<endl;
Cluster::D_cut=atof(optarg);
params+=2;
break;
}
case 'I':{
cout << "Switching likelihood test off - will not separate differentially expressed isoforms" << endl;
Cluster::D_cut=INT_MAX;
params+=1;
break;
}
case 'm':{
cout << "Setting minimum counts to "<<optarg<<endl;
Transcript::min_counts=atoi(optarg);
params+=2;
break;
}
case 'r':{ //r=whether to output read information
std::string value(optarg);
transform(value.begin(), value.end(), value.begin(), ::tolower);
if( value.compare("true-stop")==0){
output_reads=true;
stop_after_read=true;
cout << "Setting read alignments to be output to file." << endl;
cout << "Corset will exit after reading bam files." << endl;
}
else if( value.compare("true")==0 | value.compare("t")==0 ){
output_reads=true;
cout << "Setting read alignments to be output to file."<<endl;
}
else if (value.compare("false")!=0 & value.compare("f")!=0 ){
cerr << "Unknown argument passed with -r. Please specify true or false." << endl;
print_usage();
exit(1);
}
params+=2;
break;
}
case 'i':{
std::string value(optarg);
transform(value.begin(), value.end(), value.begin(), ::tolower);
if( value.compare("corset")==0 ){
read_input=read_corset_file;
output_reads=false; //override the -r option
stop_after_read=false;
} else if( value.compare("salmon_eq_classes")==0 ){
read_input=read_salmon_eq_classes_file;
output_reads=false;
stop_after_read=false;
}
else if(value.compare("fasta")==0 ){
read_input=read_fasta_file;
}
else if(value.compare("bam")!=0 && value.compare("fasta")!=0){
cerr << "Unknown input type, "<< value<<", passed with -i. Please check options." << endl;
print_usage();
exit(1);
}
params+=2;
break;
}
case 'l':{
cout << "Setting minimum reads for a link to "<<optarg
<< " (only used if -i is set to corset or salmon_eq_classes)" << endl;
Transcript::min_reads_for_link=atoi(optarg);
params+=2;
break;
}
case 'x':{
cout << "Setting maximum alignments for a read to "<<optarg
<< " (only used if -i is set to corset or salmon_eq_classes )" << endl;
Transcript::max_alignments=atoi(optarg);
params+=2;
break;
}
case '?': //other unknown options
cerr << "Unknown option.. stopping" << endl;
print_usage();
exit(1);
break;
}
}
//convert the distances provided as a string into doubles
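// e.g. -d 0.3,0.4 gives distance_thresholds = { 0.3 -> "-0.3", 0.4 -> "-0.4" },
// so the outputs become counts-0.3.txt, clusters-0.3.txt, counts-0.4.txt, etc.
// A single threshold gets an empty suffix (plain counts.txt and clusters.txt).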
map<float,string> distance_thresholds;
char * cstr = new char[distance_string.length()+1];
strcpy(cstr, distance_string.c_str());
char * single_string = strtok(cstr,",");
while (single_string != NULL){
double value = atof(single_string);
//check that the value is valid
if(value<0 || value>1){
cerr << "The distance provided is invalid. It must be between 0 and 1 (inclusive)." << endl;
print_usage();
exit(1);
}
distance_thresholds[ value ]=string("-")+single_string;
single_string = strtok(NULL,",");
}
if(distance_thresholds.size()==1)
distance_thresholds.begin()->second=""; //no suffix if only one distance cut-off
//the remaining command line arguments are names of the input files
int smpls=argc-params;
if(smpls==0){
cerr << "No input files specified" << endl;
print_usage();
exit(1);
}
if(sample_names==""){ //if the -n option (sample names) have not been
//specified, then use the filenames.
for(int s = params; s < argc ; s++){
sample_names+=string(argv[s]);
if(s < (argc-1)) sample_names+='\t';
}
}
//if the number of sample names provided for the header of the counts file
//is inconsistent with the number of samples, print an error and exit
int n_sample_names=count(sample_names.begin(), sample_names.end(),'\t')+1;
if(n_sample_names!=smpls){
cerr << "The number of sample names passed (via the -n option), " << n_sample_names
<< ", does not match the number of samples, " << smpls
<< ". Please check how many values you have passed." << endl;
exit(1);
}
if(groups.size()==0){ // if no -g option (no groups information) is given
//treat each file as a separate group
for(int s=0; s<smpls; s++)
groups.push_back(s);
Transcript::groups=smpls;
}
//before we go any further, check that there are enough groups and
//sample names for all the files
if(groups.size()!=smpls){
cerr << "The number of experimental groups passed (via the -g option) "
<< "does not match the number of input files. Please check how "
<< "many values you have passed." << endl;
exit(1);
}
//if D was not set, then set the default value
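// e.g. with 4 experimental groups, ndf = 3 and D_cut = 17.5 + 2.5*3 = 25.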
if(Cluster::D_cut==0)
Cluster::D_cut=17.5+2.5*(Transcript::groups-1);
//test whether the output files already exist
for (map<float,string>::iterator it=distance_thresholds.begin();
stop_after_read==false && it!=distance_thresholds.end(); ++it){
string type[]={Cluster::file_counts,Cluster::file_clusters};
for(int t=0; t < 2; t++){
string filename(Cluster::file_prefix+type[t]+it->second+Cluster::file_ext);
ifstream file(filename.c_str());
if(file.good()){
if(force && remove( filename.c_str() )!=0){
cerr << "Could not replace the file, "<< filename << endl;
exit(1);
}
else if(!force){
cerr << "File already exists, " << filename << ". Use \"-f true\" option to overwrite " << endl;
exit(1);
}
}
file.close();
ofstream ofile(filename.c_str());
//output the header for the counts files.
if(type[t].compare(Cluster::file_counts)==0){
ofile << '\t' + sample_names << endl;
}
ofile.close();
}
}
//set the number of samples
Transcript::samples=smpls;
//set up our data structures ready to receive the alignment information
TranscriptList * tList = new TranscriptList;
vector<ReadList*> rList; //one ReadList per sample
for(int bam_file=0; bam_file < smpls; bam_file++){
rList.push_back(read_input(string(argv[params+bam_file]),tList,bam_file));
//This is where we output the read alignment summary file for future runs of corset
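//e.g. an input of dir/sample1.bam is written out as sample1.bam.corset-reads
//(everything up to the last '/', '\' or ',' is stripped and corset_extension appended).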
if(output_reads){
string bam_filename = string(argv[params+bam_file]) ;
string base_filename = bam_filename.substr(bam_filename.find_last_of("/\\,") + 1) ;
rList[bam_file]->write(base_filename+corset_extension);
}
}
cout << "Done reading all files. "<< endl;
if(stop_after_read==true) exit(0);
//output clusters with no reads in this special case
StringSet<Transcript>::iterator it;
int n=0;
for(it=tList->begin(); it!=tList->end(); it++){
if(!(*it).second->reached_min_counts()){
if(Transcript::min_counts==0){
n++;
//output clusters
for (map<float,string>::iterator dis=distance_thresholds.begin(); dis!=distance_thresholds.end(); ++dis){
ofstream clusterFile;
string filename(Cluster::file_prefix+Cluster::file_clusters+dis->second+Cluster::file_ext);
clusterFile.open(filename.c_str(),ios_base::app);
clusterFile << (*it).second->get_name() << "\t" << Cluster::cluster_id_prefix_no_reads << n << endl;
clusterFile.close();
}
}
(*it).second->remove();
}
};
delete tList; //don't need this anymore, so why not free some space
cout << "Start to cluster the reads" << endl;
//Now the reads are parsed and the clustering and counting is performed.
// ProfilerStart("gprof.prof");
MakeClusters cList(rList, distance_thresholds,groups);
// ProfilerStop();
cout << "Finished" << endl;
return(0);
}