-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_import.py
106 lines (84 loc) · 3.14 KB
/
data_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
from collections import Counter
from collections import defaultdict
import numpy as np
def read_voc_pos_tags_from_conllu_file(filename):
    """Parse a CoNLL-U file into word/tag lists and per-sentence structures.

    Sentences are separated by blank lines. Within a sentence, comment lines
    (starting with ``#``) and multiword-token range rows (ID such as ``1-2``)
    are skipped; every other non-empty line is split on tabs and read using
    the CoNLL-U column order (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD,
    DEPREL, ...).

    Args:
        filename: Path of the UTF-8 encoded CoNLL-U file to read.

    Returns:
        A tuple ``(vocabulary, pos_tags, sentences)`` where

        * ``vocabulary`` is a list with the lower-cased FORM of every token,
          in file order (duplicates kept),
        * ``pos_tags`` is a list with column 5 (XPOS) of every token, in file
          order (duplicates kept),
        * ``sentences`` is a list of ``[tokens, M, golden_labels]`` entries,
          one per non-empty sentence. ``tokens`` maps the token's in-sentence
          index to ``[form_lower, xpos, head, deprel, form, id]``; ``M`` is a
          ``(len(tokens) + 1)``-square NumPy array with ``M[head, dep] = 1``
          for every arc plus ``M[0, 0] = 1``; ``golden_labels`` is a list of
          ``[head_index, dep_index, deprel]`` triples.
    """
    # Multiword tokens span a range of word IDs ("3-4"); they carry no
    # syntactic annotation of their own, so their rows are ignored.
    multiword_id = re.compile(r"\d+[-]\d+")

    pos_tags = []
    vocabulary = []
    sentences = []

    # Read everything up front and release the handle immediately.
    with open(filename, 'r', encoding="utf8") as file:
        text = file.read()

    for sentence in text.split('\n\n'):
        s = {}
        # Maps the textual token ID to a dense integer index; '0' (the
        # artificial root) is pinned to index 0.
        w2i = defaultdict(lambda: len(w2i))
        w2i['0'] = 0

        for line in sentence.split('\n'):
            if not line or line.startswith('#'):
                continue
            line_split = line.split('\t')
            if multiword_id.match(line_split[0]):
                continue

            token_index = w2i[line_split[0]]
            s[token_index] = [line_split[1].lower(),
                              line_split[4],
                              line_split[6],
                              line_split[7],
                              line_split[1],
                              line_split[0]]
            pos_tags.append(line_split[4])
            vocabulary.append(line_split[1].lower())

        golden_labels = []
        M = np.zeros((len(s) + 1, len(s) + 1))
        for i, w in enumerate(s.keys()):
            head = s[w][2]
            # Tokens without a head annotation contribute no arc.
            if head == '_':
                continue
            head_index = w2i[head]
            M[head_index][i + 1] = 1
            golden_labels.append([head_index, i + 1, s[w][3]])
        M[0, 0] = 1

        # Blank chunks (e.g. trailing newlines) produce no tokens; drop them.
        if s:
            sentences.append([s, M, golden_labels])

    return vocabulary, pos_tags, sentences
def read_conllu_file(filename):
    """Load a CoNLL-U file and build index mappings for words, tags and labels.

    The file is parsed with ``read_voc_pos_tags_from_conllu_file``; the
    distinct words, POS tags and dependency labels are then numbered in
    sorted order so that the same file always yields the same indices,
    independent of string-hash randomisation between interpreter runs.

    Args:
        filename: Path of the CoNLL-U file to read.

    Returns:
        A 9-tuple ``(w2i, i2w, t2i, i2t, l2i, i2l, sentences,
        index_sentences, golden_labels)``:

        * ``w2i`` / ``i2w``: word -> index and index -> word (plain dicts),
        * ``t2i`` / ``i2t``: POS tag -> index and the inverse,
        * ``l2i`` / ``i2l``: dependency label -> index and the inverse,
        * ``sentences``: the sentence list exactly as returned by the parser,
        * ``index_sentences``: per sentence, a list of
          ``(word_index, tag_index)`` tuples in token order,
        * ``golden_labels``: per sentence, a list of
          ``[head_index, dep_index, label_index]`` triples.
    """
    vocabulary, pos_tags, sentences = read_voc_pos_tags_from_conllu_file(filename)

    # NOTE(review): an earlier version tried to replace rare words with
    # '<unk>', but it counted an already de-duplicated set (every count was 1)
    # and never used the filtered result, so the full vocabulary was always
    # returned. That effective behaviour is kept here; implement real
    # frequency filtering separately if it is wanted.
    labels = {
        token[3]
        for tokens, _, _ in sentences
        for token in tokens.values()
    }

    # Enumerate sorted values: set iteration order is not stable across runs.
    w2i = {word: index for index, word in enumerate(sorted(set(vocabulary)))}
    t2i = {tag: index for index, tag in enumerate(sorted(set(pos_tags)))}
    l2i = {label: index for index, label in enumerate(sorted(labels))}

    i2w = {index: word for word, index in w2i.items()}
    i2t = {index: tag for tag, index in t2i.items()}
    i2l = {index: label for label, index in l2i.items()}

    index_sentences = []
    golden_labels = []
    for tokens, _, arcs in sentences:
        index_sentences.append(
            [(w2i[token[0]], t2i[token[1]]) for token in tokens.values()]
        )
        golden_labels.append(
            [[head, dep, l2i[label]] for head, dep, label in arcs]
        )

    return (w2i, i2w, t2i, i2t, l2i, i2l,
            sentences, index_sentences, golden_labels)