-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·279 lines (232 loc) · 9.29 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#from Python
import re
from string import Template
#user defined
import tree_traverse
import levels
from is_classes import Node
from is_classes import Tree
#module for parsing a newick tree. A newick tree is converted into a
#datastructure Tree composed of Node. See nodeAreas.py
# Branch-length pattern, following Python's float literal grammar.  The
# trailing \Z forces the WHOLE token to be a float, not just a prefix of it.
_BRLEN_RE = re.compile(
    r'(?:'
    r'(?:\d*\.\d+|\d+\.)(?:[eE][+-]?\d+)?'  # pointfloat, optional exponent
    r'|\d+[eE][+-]?\d+'                     # intpart with mandatory exponent
    r')\Z'
)


def VerifyBrlen(token):
    """Return True if the whole token is a floating point number, else False.

    The definition follows python's definition of a float literal, which is:

        floatnumber   ::= pointfloat | exponentfloat
        pointfloat    ::= [intpart] fraction | intpart "."
        exponentfloat ::= (intpart | pointfloat) exponent
        intpart       ::= digit+
        fraction      ::= "." digit+
        exponent      ::= ("e" | "E") ["+" | "-"] digit+

    Consequences of that grammar:
    * the sign of the exponent is optional ("1e5" and "1e+5" both pass);
    * a bare integer such as "1" is NOT a float and is rejected;
    * a leading sign is not part of the grammar, so "-1.0" is rejected.

    The token must match in full: trailing characters after a valid float
    (e.g. "1.5abc" or "1.0:2.0") make the token invalid.
    """
    return _BRLEN_RE.match(token) is not None
# Token patterns, compiled once.  Each is tried at the start of the line.
_LEFTPAR_RE = re.compile(r'\(')
_SEPARATOR_RE = re.compile(r',\s*')
_RIGHTPAR_RE = re.compile(r'\)(:[^,\)\s]+)?')
_NAME_RE = re.compile(r'(\w+)(#(\w+)(-\w+)?)?(:[^,\)\s]+)?')


def ExtractToken(line):
    r"""Extract the first newick token from the start of line.

    Return a tuple (token_type, token) where token is the matched text and
    token_type is one of:

    Type 1  '('
    Type 2  r',\s*'  - a comma followed by zero or more whitespace
            characters
    Type 3  ')[:floatingnumber]' - right paren optionally followed by ':'
            and a floating point number denoting the branch length
    Type 4  a name optionally followed by ':' and a branch length.  The name
            takes one of the forms
                taxon
                taxon#area
                taxon#island-zone
            where each of taxon, area, island and zone matches \w+.

    The patterns are tried in that order.  The branch length is first
    captured crudely (any run of characters other than ',', ')' and
    whitespace) and then checked with VerifyBrlen.  Everything after the
    first ':' is checked, so a token with more than one ':' is rejected
    rather than slipping through unverified.

    Each recognised token is also echoed to stdout as 'Type N: <token>'.

    Raises AssertionError if the branch length is rejected by VerifyBrlen or
    if the line does not start with any recognised token.  The errors are
    raised explicitly (not via assert statements) so that the checks are
    still performed when Python runs with -O.
    """
    found = _LEFTPAR_RE.match(line)
    if found is not None:
        token = found.group()
        print('Type 1: ' + token)
        return (1, token)

    found = _SEPARATOR_RE.match(line)
    if found is not None:
        token = found.group()
        print('Type 2: ' + token)
        return (2, token)

    found = _RIGHTPAR_RE.match(line)
    if found is not None:
        token = found.group()
        print('Type 3: ' + token)
        # partition keeps any further ':' inside the branch-length text, so
        # a malformed length such as '1:2' is validated instead of skipped.
        _, colon, brlen = token.partition(':')
        if colon and not VerifyBrlen(brlen):
            raise AssertionError('Possibly Illegal Branch Length:' + brlen)
        return (3, token)

    found = _NAME_RE.match(line)
    if found is None:
        raise AssertionError(line + "Unrecognized token")
    token = found.group()
    print('Type 4: ' + token)
    name, colon, brlen = token.partition(':')
    if colon and not VerifyBrlen(brlen):
        raise AssertionError(
            'Possibly Illegal Branch Length:' + name + ' ' + brlen)
    return (4, token)
def Read(tree_name, taxon_table, state_table, tree):
    """Parse a newick string and return it as a Tree built from Node objects.

    The string is consumed one token at a time with ExtractToken, using a
    stack (think push-down automaton):

    * '(' pushes a marker node whose label is '('.
    * A taxon token pushes a leaf node.
    * ')' pops nodes until the matching '(' marker is found, makes the
      popped nodes the children of a new internal node and pushes that node
      back.  In effect "(x, y, z)" is replaced by a single item.
    * Separator tokens (comma plus whitespace) are skipped.

    At the end of the string, a single item left on the stack is the root.
    Any other count means the outermost pair of parentheses was omitted
    (only the outermost pair can go missing, and this can only be
    recognised at the end of the string); the remaining items are then
    gathered under a newly created root.

    Because children are popped off a stack, they are stored, and listed in
    the parent's label, in the reverse of their order in the string.

    Arguments:
    tree_name   -- stored on the result as tree.name
    taxon_table -- lookup from the taxon label found in the string to the
                   value stored in node.taxon_name; if empty, the label
                   itself is used
    state_table -- lookup used for node.state of leaves written as
                   'taxon#state'; if empty, the text after '#' is used.
                   NOTE(review): the table is indexed by the taxon label,
                   not by the state text -- confirm this is intended.
    tree        -- the newick string

    Leaves without a '#' part get the state 'NOWHERE'.  A node without a
    branch length gets brlen 0.0.

    After the tree is built, the tree_traverse and levels passes at the end
    of this function are run on it (several of them print to stdout, judging
    by their names) and the tree is returned.

    Raises AssertionError for unbalanced parentheses.  The errors are raised
    explicitly (not via assert statements) so the checks survive python -O.
    """
    newick = tree
    newick_len = len(newick)
    parsed_len = 0
    remaining = newick
    stack = []

    while parsed_len < newick_len:
        (token_type, token) = ExtractToken(remaining)
        parsed_len = parsed_len + len(token)
        remaining = newick[parsed_len:]

        # separator token: nothing to record
        if token_type == 2:
            continue

        # left paren token: push a marker to be found by the matching ')'
        if token_type == 1:
            node = Node()
            node.label = token
            stack.append(node)
            continue

        # Both remaining token kinds may carry ':brlen'.
        pieces = token.split(':')
        brlen = 0.0
        if len(pieces) == 2:
            brlen = float(pieces[1])
        node = Node()
        node.brlen = brlen

        # right paren token: fold everything back to the '(' marker
        if token_type == 3:
            try:
                top = stack.pop()
                while top.label != '(':
                    node.children.append(top)
                    top = stack.pop()
            except IndexError:
                # The stack ran dry before a '(' marker was found.  Stop
                # here: carrying on would treat this ')' as a taxon name.
                raise AssertionError('Malformed tree. A missing left paren?')
            for child in node.children:
                child.parent = node
            node.label = (
                '(' + ', '.join(child.label for child in node.children) + ')')
            stack.append(node)
            continue

        # type 4 token: a leaf, written as taxon or taxon#state
        taxon_state = pieces[0].split('#')
        taxon = taxon_state[0]
        node.label = taxon
        if len(taxon_table) == 0:
            node.taxon_name = taxon
        else:
            node.taxon_name = taxon_table[taxon]
        if len(taxon_state) > 1:
            if len(state_table) == 0:
                node.state = taxon_state[1]
            else:
                node.state = state_table[taxon]
        else:
            node.state = 'NOWHERE'
        stack.append(node)

    tree = Tree()
    if len(stack) == 1:
        root = stack.pop()
        # A lone '(' marker is an unclosed group, not a tree.
        if root.label == '(':
            raise AssertionError('Malformed tree. A missing right paren?')
    else:
        # Outermost parentheses were omitted: gather what is left under a
        # new root node.
        root = Node()
        while len(stack) > 0:
            top = stack.pop()
            if top.label == '(':
                raise AssertionError('Malformed tree. A missing right paren?')
            root.children.append(top)
            top.parent = root
        root.label = (
            '(' + ', '.join(child.label for child in root.children) + ')')
    tree.root = root
    tree.root.parent = None
    tree.name = tree_name

    # Marker line printed by the original implementation; kept so that the
    # output on stdout is unchanged.
    print('Something')
    tree_traverse.PostOrderTraverse(tree.root, tree_traverse.CalculateNodeAges)
    tree_traverse.PrintLabel(tree.root)
    tree_traverse.PostOrderTraverse(tree.root, tree_traverse.PrintSelfParentLabel)
    tree_traverse.TreeLeaves(tree)
    levels.DemarcateLevels(tree)
    levels.PrintAllLevels(tree)
    return tree