-
Notifications
You must be signed in to change notification settings - Fork 13
/
pdtb2.py
652 lines (557 loc) · 23.5 KB
/
pdtb2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
#!/usr/bin/env python
"""
Penn Discourse Treebank 2.0 reader, requiring the PDTB file format
used for the materials at http://compprag.christopherpotts.net/
"""
__author__ = "Christopher Potts"
__version__ = "2.0"
__license__ = "GNU general public license, version 2"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"
######################################################################
import os
import sys
import csv
import re
from collections import defaultdict
from nltk.tree import Tree
from nltk.stem import WordNetLemmatizer
GRAPHVIZ_TEMPLATE_FILENAME = 'pdtb2-template.dot'
######################################################################
class CorpusReader:
    """
    Iterates over the rows of a PDTB 2.0 CSV file, yielding one Datum
    per data row.
    """

    def __init__(self, src_filename):
        """
        Argument
        src_filename (str) -- path to the CSV file; its first row is
        treated as a header and skipped by iter_data.
        """
        self.src_filename = src_filename

    def iter_data(self, display_progress=True):
        """
        Generator over the data rows of self.src_filename.

        Optional argument:
        display_progress -- if True (the default), a running row
        counter is written to stderr, followed by a newline once the
        file is exhausted.

        Value: yields a Datum for each row after the header. An empty
        file yields nothing.
        """
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields that contain line breaks are parsed correctly.
        with open(self.src_filename, newline='') as f:
            row_iterator = csv.reader(f)
            # Skip past the header. The default keeps an empty file from
            # raising StopIteration inside this generator, which Python
            # would turn into a RuntimeError.
            next(row_iterator, None)
            for row_number, row in enumerate(row_iterator, start=1):
                if display_progress:
                    sys.stderr.write("\rrow {}".format(row_number))
                    sys.stderr.flush()
                yield Datum(row)
        if display_progress:
            sys.stderr.write("\n")
######################################################################
class Datum:
    """
    One PDTB 2.0 relation, built from a single row of the CSV file.

    Each column named in Datum.header becomes an instance attribute.
    *_SpanList and *_GornList columns are parsed into lists of lists of
    ints, *_Trees columns into lists of nltk.tree.Tree objects, and
    Connective_StringPosition and SentenceNumber into ints. All other
    columns are kept as strings. Empty cells become None (or [] for the
    list-valued columns).
    """

    header = [
        # Corpus
        'Relation', 'Section', 'FileNumber',
        ##### Connective.
        'Connective_SpanList', 'Connective_GornList', 'Connective_Trees',
        'Connective_RawText', 'Connective_StringPosition',
        'SentenceNumber', 'ConnHead', 'Conn1', 'Conn2', 'ConnHeadSemClass1',
        'ConnHeadSemClass2', 'Conn2SemClass1', 'Conn2SemClass2',
        # Connective attribution.
        'Attribution_Source', 'Attribution_Type', 'Attribution_Polarity',
        'Attribution_Determinacy',
        'Attribution_SpanList', 'Attribution_GornList', 'Attribution_Trees',
        'Attribution_RawText',
        ##### Arg1
        'Arg1_SpanList', 'Arg1_GornList', 'Arg1_Trees', 'Arg1_RawText',
        # Arg1 Attribution.
        'Arg1_Attribution_Source', 'Arg1_Attribution_Type', 'Arg1_Attribution_Polarity',
        'Arg1_Attribution_Determinacy',
        'Arg1_Attribution_SpanList', 'Arg1_Attribution_GornList', 'Arg1_Attribution_Trees',
        'Arg1_Attribution_RawText',
        ##### Arg2.
        'Arg2_SpanList', 'Arg2_GornList', 'Arg2_Trees', 'Arg2_RawText',
        # Arg2 Attribution.
        'Arg2_Attribution_Source', 'Arg2_Attribution_Type', 'Arg2_Attribution_Polarity',
        'Arg2_Attribution_Determinacy',
        'Arg2_Attribution_SpanList', 'Arg2_Attribution_GornList', 'Arg2_Attribution_Trees',
        'Arg2_Attribution_RawText',
        ##### Sup1.
        'Sup1_SpanList', 'Sup1_GornList', 'Sup1_Trees', 'Sup1_RawText',
        ##### Sup2
        'Sup2_SpanList', 'Sup2_GornList', 'Sup2_Trees', 'Sup2_RawText',
        ##### Full raw text.
        'FullRawText']

    # Shared WordNetLemmatizer, created on first use by __lemmatize so
    # that it is not rebuilt for every token.
    _wn_lemmatizer = None

    def __init__(self, row):
        """
        Where row is a list, it is processed directly.
        Where row is a string, it is parsed into a list using a csv.reader.
        The attribute names are given in the class variable Datum.header.

        Only as many attributes are set as row has values. A row with
        more values than Datum.header has names raises IndexError.
        """
        if isinstance(row, str) or row.__class__.__name__ == 'unicode':
            row = list(csv.reader([row.strip()]))[0]
        # Set the attributes.
        for i, row_value in enumerate(row):
            att_name = Datum.header[i]
            # Span lists.
            if "SpanList" in att_name:
                row_value = self.__process_span_list(row_value)
            # Gorn lists.
            elif "GornList" in att_name:
                row_value = self.__process_gorn_list(row_value)
            # Integer-valued.
            elif att_name in ('Connective_StringPosition', 'SentenceNumber'):
                row_value = int(row_value) if row_value else None
            # Trees.
            elif "Trees" in att_name:
                row_value = self.__process_trees(row_value)
            # The rest are strings and so don't require special handling.
            elif not row_value:
                row_value = None
            setattr(self, att_name, row_value)

    ######################################################################
    # SEMANTIC VALUES

    def primary_semclass1(self):
        """
        ConnHeadSemClass1 has fields separated by dots. This function
        returns the first (as a str), which is always one of
        Comparison, Contingency, Expansion, Temporal
        (or None where ConnHeadSemClass1 is empty).
        """
        return self.semclass1_values()[0]

    def secondary_semclass1(self):
        """
        ConnHeadSemClass1 has fields separated by dots. This function
        returns the second (as a str) if there is one, else None.
        Values (except None):
        Alternative, Asynchronous, Cause, Concession, Condition,
        Conjunction, Contrast, Exception, Instantiation, List,
        Pragmatic cause, Pragmatic concession, Pragmatic condition,
        Pragmatic contrast, Restatement, Synchrony
        """
        vals = self.semclass1_values()
        return vals[1] if len(vals) >= 2 else None

    def tertiary_semclass1(self):
        """
        ConnHeadSemClass1 has fields separated by dots. This function
        returns the third (as a str) if there is one, else None.
        Values (except None):
        Chosen alternative, Conjunctive, Contra-expectation,
        Disjunctive, Equivalence, Expectation, Factual past, Factual
        present, General, Generalization, Hypothetical, Implicit
        assertion, Justification, Juxtaposition, NONE, Opposition,
        Precedence, Reason, Relevance, Result, Specification,
        Succession, Unreal past, Unreal present
        """
        vals = self.semclass1_values()
        return vals[2] if len(vals) >= 3 else None

    def semclass1_values(self):
        """
        Returns the dot-separated fields of ConnHeadSemClass1 as a list
        of str, or [None] where ConnHeadSemClass1 is empty.
        """
        if self.ConnHeadSemClass1:
            return self.ConnHeadSemClass1.split(".")
        return [None]

    ######################################################################
    # TOKENIZING AND POS-TAGGING WITH THE OPTION TO CONVERT
    # CONVERTABLE TAGS TO WORDNET STYLE

    def arg1_words(self, lemmatize=False):
        """
        Returns the list of words associated with Arg1. lemmatize=True
        uses nltk.stem.WordNetLemmatizer() on the list.
        """
        return self.__words(self.arg1_pos, lemmatize=lemmatize)

    def arg2_words(self, lemmatize=False):
        """
        Returns the list of words associated with Arg2. lemmatize=True
        uses nltk.stem.WordNetLemmatizer() on the list.
        """
        return self.__words(self.arg2_pos, lemmatize=lemmatize)

    def arg1_attribution_words(self, lemmatize=False):
        """
        Returns the list of words associated with Arg1's attribution
        (if any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on
        the list.
        """
        return self.__words(self.arg1_attribution_pos, lemmatize=lemmatize)

    def arg2_attribution_words(self, lemmatize=False):
        """
        Returns the list of words associated with Arg2's attribution
        (if any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on
        the list.
        """
        return self.__words(self.arg2_attribution_pos, lemmatize=lemmatize)

    def connective_words(self, lemmatize=False):
        """
        Returns the list of words in the Connective_Trees of this
        relation ([] where there are none). lemmatize=True uses
        nltk.stem.WordNetLemmatizer() on the list.
        """
        return self.__words(self.connective_pos, lemmatize=lemmatize)

    def sup1_words(self, lemmatize=False):
        """
        Returns the list of words associated with Sup1 (if
        any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list.
        """
        return self.__words(self.sup1_pos, lemmatize=lemmatize)

    def sup2_words(self, lemmatize=False):
        """
        Returns the list of words associated with Sup2 (if
        any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list.
        """
        return self.__words(self.sup2_pos, lemmatize=lemmatize)

    def __words(self, method, lemmatize=False):
        """
        Internal method used by the X_words functions to get at their
        (possibly lemmatized) words. method is one of the X_pos bound
        methods; the first member of each (word, pos) pair is kept.
        """
        return [pair[0] for pair in method(lemmatize=lemmatize)]

    def arg1_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with
        Arg1. lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list. wn_format=True merely converts to WordNet tags where
        possible, without lemmatizing.
        """
        return self.arg_pos(1, wn_format=wn_format, lemmatize=lemmatize)

    def arg2_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with
        Arg2. lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list. wn_format=True merely converts to WordNet tags where
        possible, without lemmatizing.
        """
        return self.arg_pos(2, wn_format=wn_format, lemmatize=lemmatize)

    def arg_pos(self, index, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with ArgN,
        where N = index (1 or 2). lemmatize=True uses
        nltk.stem.WordNetLemmatizer() on the list. wn_format=True merely
        converts to WordNet tags where possible, without lemmatizing.
        """
        return self.__pos("Arg%s" % index, wn_format=wn_format, lemmatize=lemmatize)

    def arg1_attribution_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with Arg1's
        attribution. lemmatize=True uses nltk.stem.WordNetLemmatizer()
        on the list. wn_format=True merely converts to WordNet tags
        where possible, without lemmatizing.
        """
        return self.arg_attribution_pos(1, wn_format=wn_format, lemmatize=lemmatize)

    def arg2_attribution_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with Arg2's
        attribution. lemmatize=True uses nltk.stem.WordNetLemmatizer()
        on the list. wn_format=True merely converts to WordNet tags
        where possible, without lemmatizing.
        """
        return self.arg_attribution_pos(2, wn_format=wn_format, lemmatize=lemmatize)

    def arg_attribution_pos(self, index, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with ArgN's
        attribution, where N = index (1 or 2). lemmatize=True uses
        nltk.stem.WordNetLemmatizer() on the list. wn_format=True merely
        converts to WordNet tags where possible, without lemmatizing.
        """
        return self.__pos("Arg%s_Attribution" % index, wn_format=wn_format, lemmatize=lemmatize)

    def connective_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs in the Connective_Trees
        of this relation ([] where there are none). lemmatize=True uses
        nltk.stem.WordNetLemmatizer() on the list. wn_format=True merely
        converts to WordNet tags where possible, without lemmatizing.
        """
        return self.__pos("Connective", wn_format=wn_format, lemmatize=lemmatize)

    def sup1_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with Sup1 (if
        any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list. wn_format=True merely converts to WordNet tags where
        possible, without lemmatizing.
        """
        return self.sup_pos(1, wn_format=wn_format, lemmatize=lemmatize)

    def sup2_pos(self, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs associated with Sup2 (if
        any). lemmatize=True uses nltk.stem.WordNetLemmatizer() on the
        list. wn_format=True merely converts to WordNet tags where
        possible, without lemmatizing.
        """
        return self.sup_pos(2, wn_format=wn_format, lemmatize=lemmatize)

    def sup_pos(self, index, wn_format=False, lemmatize=False):
        """
        Returns the list of (word, pos) pairs (if any) associated with
        SupN where N = index (1 or 2). lemmatize=True uses
        nltk.stem.WordNetLemmatizer() on the list. wn_format=True merely
        converts to WordNet tags where possible, without lemmatizing.
        """
        return self.__pos("Sup%s" % index, wn_format=wn_format, lemmatize=lemmatize)

    def __pos(self, typ, wn_format=False, lemmatize=False):
        """
        Internal method used to get the POS version, potentially
        lemmatized, associated with typ strings (Arg1, Sup2, etc.).
        The pairs come from calling pos() on each tree in the
        attribute named typ + "_Trees".
        """
        results = []
        for tree in getattr(self, "%s_Trees" % typ):
            results += tree.pos()
        # Lemmatizing implies converting to WordNet tags.
        if lemmatize:
            results = [self.__lemmatize(self.__treebank2wn_pos(pair)) for pair in results]
        # This is tag conversion without lemmatizing.
        elif wn_format:
            results = [self.__treebank2wn_pos(pair) for pair in results]
        return results

    def __treebank2wn_pos(self, lemma):
        """
        Internal method used for converting a (word, tag) pair's tag to
        WordNet format where possible. The tag is always lowercased;
        tags beginning with v, n, j and rb become 'v', 'n', 'a' and
        'r' respectively, and any other tag is left lowercased.
        """
        string, tag = lemma
        tag = tag.lower()
        if tag.startswith('v'):
            tag = 'v'
        elif tag.startswith('n'):
            tag = 'n'
        elif tag.startswith('j'):
            tag = 'a'
        elif tag.startswith('rb'):
            tag = 'r'
        return (string, tag)

    def __lemmatize(self, lemma):
        """
        Internal method used for applying nltk.stem.WordNetLemmatizer()
        to the (word, pos) pair lemma. Pairs whose tag is not one of
        'a', 'n', 'r', 'v' are returned unchanged.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):
            # Build the lemmatizer once and reuse it for every token.
            if Datum._wn_lemmatizer is None:
                Datum._wn_lemmatizer = WordNetLemmatizer()
            string = Datum._wn_lemmatizer.lemmatize(string, tag)
        return (string, tag)

    ######################################################################
    # POSITIONING.

    def relative_arg_order(self):
        """
        Classifies the relative position of Arg1 and Arg2, using the
        smallest and largest indices in their span lists:

        1S ... 1F ... 2S ... 2F -> arg1_precedes_arg2
        1S ... 2S ... 2F ... 1F -> arg1_contains_arg2
        1S ... 2S ... 1F ... 2F -> arg1_precedes_and_overlaps_but_does_not_contain_arg2
        2S ... 2F ... 1S ... 1F -> arg2_precedes_arg1
        2S ... 1S ... 1F ... 2F -> arg2_contains_arg1
        2S ... 1S ... 2F ... 1F -> arg2_precedes_and_overlaps_but_does_not_contain_arg1

        Value: one of the str labels above. All comparisons are strict,
        so an Exception is raised where none of the patterns applies
        (for example, where both arguments start at the same index).
        """
        arg1_indices = [i for span in self.Arg1_SpanList for i in span]
        arg1_start = min(arg1_indices)
        arg1_finish = max(arg1_indices)
        arg2_indices = [i for span in self.Arg2_SpanList for i in span]
        arg2_start = min(arg2_indices)
        arg2_finish = max(arg2_indices)
        # Arg1 preceding:
        if arg1_finish < arg2_start:
            return 'arg1_precedes_arg2'
        if arg1_start < arg2_start and arg2_finish < arg1_finish:
            return 'arg1_contains_arg2'
        if arg1_start < arg2_start < arg1_finish < arg2_finish:
            return 'arg1_precedes_and_overlaps_but_does_not_contain_arg2'
        # Arg2 preceding:
        if arg2_finish < arg1_start:
            return 'arg2_precedes_arg1'
        if arg2_start < arg1_start and arg1_finish < arg2_finish:
            return 'arg2_contains_arg1'
        # Mirror image of the Arg1 overlap test above.
        if arg2_start < arg1_start < arg2_finish < arg1_finish:
            return 'arg2_precedes_and_overlaps_but_does_not_contain_arg1'
        raise Exception("No relation could be determined for the two arguments!\n%s" % self.FullRawText)

    def arg1_precedes_arg2(self):
        """
        Returns True if the whole of Arg1 precedes the whole of Arg2,
        else False. (So the False includes both where Arg2 precedes
        Arg1 and where they are in some kind of overlap relationship).
        1S ... 1F ... 2S ... 2F
        """
        return self.relative_arg_order() == 'arg1_precedes_arg2'

    def arg2_precedes_arg1(self):
        """
        Returns True if the whole of Arg2 precedes the whole of Arg1,
        else False. (So the False includes both where Arg1 precedes
        Arg2 and where they are in some kind of overlap relationship).
        2S ... 2F ... 1S ... 1F
        """
        return self.relative_arg_order() == 'arg2_precedes_arg1'

    def arg1_contains_arg2(self):
        """
        Returns True if Arg1 contains Arg2 completely, else False.
        1S ... 2S ... 2F ... 1F
        """
        return self.relative_arg_order() == 'arg1_contains_arg2'

    def arg2_contains_arg1(self):
        """
        Returns True if Arg2 contains Arg1 completely, else False.
        2S ... 1S ... 1F ... 2F
        """
        return self.relative_arg_order() == 'arg2_contains_arg1'

    def arg1_precedes_and_overlaps_but_does_not_contain_arg2(self):
        """
        Arg1 begins before Arg2, but Arg1 also ends before Arg2:
        1S ... 2S ... 1F ... 2F
        It seems that this is not attested in the data.
        """
        return self.relative_arg_order() == 'arg1_precedes_and_overlaps_but_does_not_contain_arg2'

    def arg2_precedes_and_overlaps_but_does_not_contain_arg1(self):
        """
        Arg2 begins before Arg1, but Arg2 also ends before Arg1:
        2S ... 1S ... 2F ... 1F
        It seems that this is not attested in the data.
        """
        return self.relative_arg_order() == 'arg2_precedes_and_overlaps_but_does_not_contain_arg1'

    ######################################################################
    # NORMALIZATION.

    def conn_str(self, distinguish_implicit=True):
        """
        Provides a method for looking at the intuitive main element of
        a connective despite the variation across relation types
        in what the main element is.

        Optional argument:
        distinguish_implicit -- prefixes Implicit= to the relation
        where appropriate (default: True)

        Value: a str (ConnHead for Explicit, Connective_RawText for
        AltLex, Conn1 for Implicit), or None for any other relation.
        """
        rel = self.Relation
        if rel == 'Explicit':
            return self.ConnHead
        elif rel == 'AltLex':
            return self.Connective_RawText
        elif rel == 'Implicit':
            prefix = ""
            if distinguish_implicit:
                prefix = "Implicit="
            return prefix + self.Conn1
        else:
            return None

    def final_arg1_attribution_source(self):
        """Follows inherited attribution values for Arg1."""
        return self.final_arg_attribution_source(1)

    def final_arg2_attribution_source(self):
        """Follows inherited attribution values for Arg2."""
        return self.final_arg_attribution_source(2)

    def final_arg_attribution_source(self, index):
        """
        Where the attribution on an argument is Inh (inherited),
        supply the inherited value (from the connective).

        Argument:
        index -- 1 or 2 depending on the argument sought

        Value: a str (or None where the attribute is empty)

        Raises ValueError where index is neither 1 nor 2.
        """
        if index not in (1, 2):
            raise ValueError('index must be int 1 or int 2; was %s (type %s).\n' % (index, index.__class__.__name__))
        src = getattr(self, "Arg%d_Attribution_Source" % index)
        if src == "Inh":
            src = self.Attribution_Source
        return src

    ######################################################################
    # Visualization.

    def to_graphviz(self, include_ptb=True):
        """
        Uses the Graphviz template to create Graphviz code for this Datum.

        Optional argument:
        include_ptb -- if False, tree-valued attributes are rendered as
        the label "(not pictured)" (default: True)

        Value: the filled-in template, as a multiline str.

        Raises an Exception where the template file cannot be read.
        """
        try:
            with open(GRAPHVIZ_TEMPLATE_FILENAME) as template_file:
                template = template_file.read()
        except (IOError, OSError):
            raise Exception("Can't find the Graphviz template file %s" % GRAPHVIZ_TEMPLATE_FILENAME)
        # Title.
        new_line_removal = re.compile(r"\n\s*", re.M)
        cleaned_text = new_line_removal.sub(" ", self.FullRawText or "")
        title = '"%s\\n%s\\nSource: %s/wsj_00%s"' % (self.Relation, cleaned_text, self.Section, self.FileNumber)
        template = template.replace("$TITLE", title)
        # Attributes.
        attributes = dir(self)
        for att in attributes:
            # Get the value for this attribute.
            val = getattr(self, att)
            # Trees are handled specially.
            if re.search(r"_Trees", att):
                if include_ptb:
                    val_str = self.__format_graphviz_trees(val)
                else:
                    val_str = '"(not pictured)"'
            elif val == "Inh":
                val_str = '"%s / %s"' % (val, self.Attribution_Source)
            else:
                val_str = '"%s"' % val
            # Identify the variable in the template.
            att_re = re.compile(r'"\$' + att.upper() + r'"', re.MULTILINE)
            # Substitute via a function so that backslashes in the value
            # are inserted literally rather than parsed as regex
            # replacement escapes.
            template = att_re.sub(lambda match, label=val_str: label, template)
        # Remove the template lines for attributes with an empty value.
        for att in attributes:
            val = getattr(self, att)
            if not val:
                template = re.sub(r".*\s%s\s.*" % att, "", template)
        # Remove blank lines to keep the file neat.
        blank_line_re = re.compile(r"^\s*$", re.M)
        template = blank_line_re.sub("", template)
        # Return the graphviz code, as a multiline string.
        return template

    def __format_graphviz_trees(self, trees):
        """
        Internal method for formatting trees for Graphviz node labels:
        an HTML-like table with one single-line row per tree, or '""'
        where trees is empty.
        """
        if not trees:
            return '""'
        rows = ["<tr><td>%s</td></tr>" % re.sub(r"\n\s*", " ", str(tree)) for tree in trees]
        return "<<table border='0'>" + "".join(rows) + "</table>>"

    def __str__(self):
        """Just returns the full raw text of the relation."""
        return self.FullRawText

    ######################################################################
    # INTERNAL HELPER METHODS.

    def __process_span_list(self, s):
        """
        Argument
        s (str) -- of the form i..j;k..m
        Value
        A list of lists of ints, one inner list per span. Empty
        segments (e.g. from a trailing semicolon) are ignored.
        """
        if not s:
            return []
        parts = [part for part in re.split(r"\s*;\s*", s.strip()) if part]
        # Build real lists: lazy map objects could only be read once.
        return [[int(x) for x in re.split(r"\s*\.\.\s*", part)] for part in parts]

    def __process_gorn_list(self, s):
        """
        Argument
        s (str) -- of the form i,j,...;k,l,...
        Value
        A list of lists (any length) of ints. Empty segments (e.g.
        from a trailing semicolon) are ignored.
        """
        if not s:
            return []
        parts = [part for part in re.split(r"\s*;\s*", s.strip()) if part]
        # Build real lists: lazy map objects could only be read once.
        return [[int(x) for x in re.split(r"\s*,\s*", part)] for part in parts]

    def __process_trees(self, s):
        """
        Input
        a string representing Penn parsetrees, delimited by |||
        Value:
        A list of NLTK Tree objects.
        """
        if not s:
            return []
        return [Tree.fromstring(tree_str) for tree_str in s.split("|||")]
######################################################################
if __name__ == '__main__':
    # Command-line use: treat the first argument as the string form of
    # a single CSV row and print the Graphviz code generated for it.
    # With no argument, nothing is printed.
    cli_args = sys.argv[1:]
    if cli_args:
        datum = Datum(cli_args[0])
        print(datum.to_graphviz())