-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_superReference.c
155 lines (145 loc) · 4.88 KB
/
make_superReference.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/**
**
** Author: Nadia Davidson, [email protected]
** Modified: 26 October 2016
**
** e.g. gffread temp.gff -g /group/bioi1/shared/genomes/hg38/fasta/hg38.fa -w temp.fasta
**/
#include <iostream>
#include <fstream>
#include <istream>
#include <string>
#include <sstream>
#include <map>
#include <stdlib.h>
#include <vector>
#include <algorithm>
#include "misc.h"
using namespace std;
vector<g_interval> merge_intervals( vector<g_interval> interval){
//loop over each interval
vector<g_interval> result;
if(interval.size()<2) return interval;
for(int i=0; i<(interval.size()-1); i++){
bool merged=false;
for(int j=(i+1); (j<interval.size()) && !merged; j++){
//check if the two intervals overlap
int x1=interval.at(i).start;
int y1=interval.at(i).end;
int x2=interval.at(j).start;
int y2=interval.at(j).end;
if((interval.at(i).chrom==interval.at(j).chrom) &&
!(y1<x2) && !(y2<x1) &&
interval.at(i).strand==interval.at(j).strand){
interval.at(j).start=min(x1,x2);
interval.at(j).end=max(y1,y2);
merged=true;
}
}
if(!merged) result.push_back(interval.at(i));
}
result.push_back(interval.at(interval.size()-1));
return result;
}
// the real stuff starts here.
int main(int argc, char **argv){
if(argc!=4){
cout << "Wrong number of arguments" << endl;
cout << "Usage: " << endl;
cout << " make_superReference <in_uscs_table_file> <out_genome_gtf_flattened> <out_super_transcript_gtf>" << endl;
exit(1);
}
ifstream file;
//Open the exons file
file.open(argv[1]);
if(!(file.good())){
cout << "Unable to open file " << argv[1] << endl;
exit(1);
}
// reading the known junction list file
cout << "Reading the input file..." << endl;
string line;
string column;
string chrom;
string symbol;
string strand;
map<string,vector<g_interval> > exons;
for(int skip=1; skip!=0 && getline(file,line) ; skip--); //skip the first line
while(getline(file,line) ){
istringstream line_stream(line);
line_stream >> column; //transcript ID (not used)
line_stream >> chrom;
line_stream >> strand;
line_stream >> column;
vector<int> starts = get_vector_from_list(column);
line_stream >> column;
vector<int> ends = get_vector_from_list(column);
line_stream >> symbol;
for(int i=0; i < starts.size() ; i++){
g_interval new_exon;
new_exon.chrom=chrom;
new_exon.start=starts.at(i)+1;
new_exon.end=ends.at(i);
new_exon.strand=strand;
exons[symbol].push_back(new_exon);
}
}
file.close();
const static int MAX_INTRON = 1000000;
//loop over exons
map<string,vector<g_interval> >::iterator exonItr=exons.begin();
map<string,vector<g_interval> >::iterator exonItrEnd=exons.end();
//Open the output file
cout << "Merging exon intervals to make a flat gff file ..." << endl;
ofstream flat_file, ann_file;
static int chain_id=1;
flat_file.open(argv[2]);
ann_file.open(argv[3]);
for( ; exonItr!=exonItrEnd ; exonItr++){ //looping over the genes
vector<g_interval> exonInterval = merge_intervals(exonItr->second); //merging exons
sort(exonInterval.begin(),exonInterval.end(),g_interval_compare);
int last_start, last_end=0;
vector<int> genome_starts,genome_ends,block_sizes;
string chrom=exonInterval.at(0).chrom;
string strand=exonInterval.at(0).strand;
int st_length=0;
for(int i=0; i<exonInterval.size(); i++){ //loop over the exons in the gene
string this_chrom=exonInterval.at(i).chrom;
int this_start=exonInterval.at(i).start;
int this_end=exonInterval.at(i).end;
string this_strand=exonInterval.at(i).strand;
if(last_end==0
|| ( chrom==this_chrom && abs(this_start-last_end)<MAX_INTRON && strand==this_strand)){
genome_starts.push_back(this_start);
genome_ends.push_back(this_end);
st_length+=this_end-this_start+1;
block_sizes.push_back(this_end-this_start+1);
last_start=this_start; last_end=this_end;
}
}
//now loop again over the filtered exons
ann_file << "chain\t1\t" << chrom << "\t10000000000\t"
<< "\t+\t" << genome_starts.at(0)-1
<< "\t" << genome_ends.at(genome_ends.size()-1)
<< "\t" << exonItr->first << "\t" << st_length
<< "\t" << strand << "\t0\t" << st_length
<< "\t" << chain_id << endl;
for(int i=0; i<genome_starts.size(); i++){
flat_file << chrom << "\t" ;
flat_file << "superT\texon\t";
flat_file << genome_starts.at(i) << "\t" ;
flat_file << genome_ends.at(i) << "\t" ;
flat_file << ".\t"<< strand << "\t.\t";
flat_file << exonItr->first << endl;
int i_max=genome_starts.size()-1;
ann_file << genome_ends.at(i) - genome_starts.at(i) + 1 ;
if(i<i_max)
ann_file << "\t" << genome_starts.at(i+1) - genome_ends.at(i) - 1 << "\t0";
ann_file << endl;
}
chain_id++;
}
flat_file.close();
ann_file.close();
cout << "All done" << endl;
}