-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgraphEval.py
executable file
·310 lines (293 loc) · 10.7 KB
/
graphEval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python
from __future__ import print_function,division
import urllib2, json, argparse, sys
from collections import OrderedDict, defaultdict
from itertools import izip
def getAlleles(url):
"""
Sends a series of get-allele requests to the server url specified by the user, starting
with an allele id of 0 and incrementing the id until the get request fails. Returns
all alleles in the dictionary alleleDict, in which allele names are the key and
allele path items (stored as a list of dicts) are values.
Currently assumes that all alt IDs are greater than the ref ID, and only starts
storing alleles once the ref allele is reached. This is so that ancestral
alleles (which somtimes precede the ref) are skipped.
"""
print("Getting alleles from server...")
alleleDict=OrderedDict()
success=True
alleleNumber=-1
refReached=False
while success==True:
alleleNumber+=1
try:
text=urllib2.urlopen(url+'/alleles/'+str(alleleNumber)).read()
json_=json.loads(text)
id_=int(json_['id'])
name=json_['name'].split('.')[-1]
if name=='ref':
refReached=True
if refReached:
variantSetID=json_['variantSetId']
path=json_['path']
segments=path['segments']
allelePathItemList=[]
for segment in segments:
allelePathItem={}
length=int(segment['length'])
start=segment['start']
strand=start['strand']
base=start['base']
pos=int(base['position'])
seqId=int(base['sequenceId'])
allelePathItem['seq']=seqId
allelePathItem['strand']=strand
allelePathItem['pos']=pos
allelePathItem['length']=length
allelePathItemList.append(allelePathItem)
alleleDict[name]=allelePathItemList
except urllib2.HTTPError:
success=False
return alleleDict
def maf2Indices(inputFile):
"""
Extracts from the maf file input by the user a dictionary containing
allele names as keys. Values are lists of integers with lengths equal to
the lengths of the allele sequences; integers correspond to the 1-based index of
the reference base that the allele base is aligned to (or 0 if the
base is not aligned.) If the alt base aligns to the '-' strand of the ref,
the integer will be negative.
ToDo: This program currently assumes that there is only one
block per MAF file, which is not generally true of the format,
but happens to be true of the GRC maf files.
"""
print("Converting maf file into alignment indices...")
altDict=defaultdict(list)
cursorDict={}
infoList=[]
sequenceList=[]
seqID=-1
for line in inputFile:
line=line.strip().split()
if line and line[0]=='s':
seqID+=1
name,start,length,strand,sourceLength,sequence=line[1:]
start=int(start)
sourceLength=int(sourceLength)
infoList.append([name,strand,seqID])
sequenceList.append(sequence)
if strand=='+':
cursorDict[seqID]=start
elif strand=='-':
cursorDict[seqID]=sourceLength-start-1
else:
raise Exception("strand is not '+' or '-'.")
if name not in altDict and name!='ref':
altDict[name]=[0]*sourceLength
refIndex=0
for letterList in izip(*sequenceList):
if letterList[0]!='-':refIndex+=1
for info,altLetter in izip(infoList[1:],letterList[1:]):
name,strand,seqID=info
altIndex=cursorDict[seqID]
if letterList[0]=='-':
if altLetter=='-':
pass
else:
if strand=='-':
cursorDict[seqID]-=1
else:
cursorDict[seqID]+=1
else:
if altLetter=='-':
pass
else:
if strand=='-':
altDict[name][altIndex]=-refIndex
cursorDict[seqID]-=1
else:
altDict[name][altIndex]=refIndex
cursorDict[seqID]+=1
return altDict
def graph2Indices(alleleDict):
"""
Extracts from the graph server alleles input by the user a dictionary
containing allele names as keys. Values are lists of integers with lengths
equal to the lengths of the allele sequences; integers correspond to the 1-based
index of the reference base that the allele base is aligned to (or 0 if the
base is not aligned.) If the alt base aligns to the '-' strand of the ref,
the integer will be negative.
"""
print("Converting graph alleles into alignment indices...")
altDict=defaultdict(list)
refSegMap=defaultdict(dict)
refIndex=1
#Record which sequence bases map to which ref bases
for pathItem in alleleDict['ref']:
seqID=pathItem['seq']
strand=pathItem['strand']
start=pathItem['pos']
length=pathItem['length']
if strand=='POS_STRAND':
for seqIndex in xrange(start+1,start+length+1):
refSegMap[seqID][seqIndex]=refIndex
refIndex+=1
else:
for seqIndex in xrange(-start-1,-start+length-1):
refSegMap[seqID][seqIndex]=refIndex
refIndex+=1
for name in alleleDict:
if name!='ref':
for pathItem in alleleDict[name]:
seqID=pathItem['seq']
strand=pathItem['strand']
start=pathItem['pos']
length=pathItem['length']
if seqID in refSegMap:
if strand=='POS_STRAND':
for seqIndex in xrange(start+1,start+length+1):
if seqIndex in refSegMap[seqID]:
altDict[name].append(refSegMap[seqID][seqIndex])
elif -seqIndex in refSegMap[seqID]:
altDict[name].append(-refSegMap[seqID][-seqIndex])
else:
altDict[name].append(0)
else:
for seqIndex in xrange(-start-1,-start+length-1):
if seqIndex in refSegMap[seqID]:
altDict[name].append(refSegMap[seqID][seqIndex])
elif -seqIndex in refSegMap[seqID]:
altDict[name].append(-refSegMap[seqID][-seqIndex])
else:
altDict[name].append(0)
else:
altDict[name]+=[0]*length
return altDict
def mergeRefItems(refPathList):
"""
Before computing the overlap between alt alleles and a reference
allele, it helps to eliminate overlapping regions in the reference
(like from duplicated segments.)
Thus, this function takes the reference allele (a list of dicts), and
returns a dict where sequence IDs (integers) are keys, and values are
lists of tuples corresponding to base-index-ranges of each sequence that
the reference allele spans.
"""
unmergedRefDict=defaultdict(list)
refDict=defaultdict(list)
for pathItem in refPathList:
seq=pathItem['seq']
length=pathItem['length']
strand=pathItem['strand']
start=pathItem['pos']
if strand=='POS_STRAND':
end=start+length-1
else:
end=start-length+1
unmergedRefDict[seq].append(sorted([start,end]))
for seq,rangeList in unmergedRefDict.iteritems():
for begin,end in sorted(rangeList):
if refDict[seq] and refDict[seq][-1][1] >= begin - 1:
refDict[seq][-1] = (refDict[seq][-1][0], end)
else:
refDict[seq].append((begin, end))
return refDict
def getRefOverlap(allelePathItemList,refDict):
"""
Takes an alt allele (a list of dicts), and a ref dict
(a dict containing base-index-ranges for each Sequence...
see function mergeRefItems.)
Computes a percent overlap of the alt allele with the reference,
i.e. number of bases in alt overlapping ref, divided by total
alt bases.
"""
overlapLength=0
totalLength=0
for pathItem in allelePathItemList:
totalLength+=pathItem['length']
for pathItem in allelePathItemList:
segSeq=pathItem['seq']
segStrand=pathItem['strand']
segStart=pathItem['pos']
segLength=pathItem['length']
segEnd=segStart+segLength-1
if segStrand=='NEG_STRAND':
segEnd=segStart-segLength+1
segStart,segEnd=sorted([segStart,segEnd])
for refStart,refEnd in refDict[segSeq]:
if not ((segStart<refStart and segEnd<refEnd) or (segStart>refStart and segEnd>refEnd)):
if segStart>=refStart and segEnd<=refEnd:
overlapLength+=segLength
elif segStart<=refStart and segEnd>=refEnd:
overlapLength+=refEnd-refStart+1
elif segStart<=refStart and segEnd>=refStart:
overlapLength+=segEnd-refStart+1
elif segStart<=refEnd and segEnd>=refEnd:
overlapLength+=segEnd-refEnd+1
refOverlapFraction=overlapLength/totalLength
return refOverlapFraction
def parseArgs():
parser = argparse.ArgumentParser(description="""Performs the specified evaluation on a specified graph server.
Requires a url to be supplied from the user.""")
parser.add_argument('url',type=str,help="""A string containing the url of the graph server to be evaluated.
Must include a version number at the end of the url, e.g. 'v0.6.g' """)
parser.add_argument('--align2ref', action='store_const', const=True,
help="""Compares each allele returned by the server to the reference allele, returning a percent overlap.
Assumes that the reference allele is named "ref", or "ref.ref".""")
parser.add_argument('--maf',type=file, help="""Compares the graph-alignments of each allele to the reference,
to the corresponding graph-alignments in a separate MAF file. Requires the name of the maf file.
Assumes the reference allele is named 'ref' or 'ref.ref'. Also currently assumes the MAF only contains
a single block.""")
args = parser.parse_args()
return args
def main():
args=parseArgs()
if args.align2ref:
alleleDict=getAlleles(args.url)
refAllele=alleleDict['ref']
refDict=mergeRefItems(refAllele)
for id_,allele in alleleDict.items():
refOverlapFraction=getRefOverlap(allele,refDict)
print(refOverlapFraction)
elif args.maf:
alleleDict=getAlleles(args.url)
graphAltDict=graph2Indices(alleleDict)
mafAltDict=maf2Indices(args.maf)
assert set(mafAltDict.keys())==set(graphAltDict.keys())
for alt in mafAltDict:
assert len(mafAltDict[alt])==len(graphAltDict[alt])
print("Computing precision and recall...\n")
totalMatchCount=0
totalNumMafAlignedBases=0
totalNumGraphAlignedBases=0
for alt in mafAltDict:
mafAlt=mafAltDict[alt]
graphAlt=graphAltDict[alt]
matchCount=sum(map(lambda n:1 if n[0]!=0 and n[0]==n[1] else 0,izip(mafAlt,graphAlt)))
numMafAlignedBases=len([i for i in mafAlt if i!=0])
numGraphAlignedBases=len([i for i in graphAlt if i!=0])
totalMatchCount+=matchCount
totalNumMafAlignedBases+=numMafAlignedBases
totalNumGraphAlignedBases+=numGraphAlignedBases
try: precision=matchCount/numGraphAlignedBases
except ZeroDivisionError:
precision=0
print("ZeroDivisionError when computing precision for {}.".format(alt))
try: recall=matchCount/numMafAlignedBases
except ZeroDivisionError:
recall=0
print("ZeroDivisionError when computing recall for {}.".format(alt))
print("Allele: {}\nPrecision: {:.3f}\nRecall: {:.3f}\n".format(alt,precision,recall))
try: avePrecision=totalMatchCount/totalNumGraphAlignedBases
except ZeroDivisionError:
avePrecision=0
print("ZeroDivisionError when computing average precision.")
try: aveRecall=totalMatchCount/totalNumMafAlignedBases
except ZeroDivisionError:
aveRecall=0
print("ZeroDivisionError when computing average recall.")
print("Overall\nPrecision: {:.3f}\nRecall: {:.3f}".format(avePrecision,aveRecall))
else:
print("Please specify an evaluation option.")
if __name__ == '__main__':
main()