-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paths_word_lists.py
431 lines (378 loc) · 19.4 KB
/
s_word_lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
import csv
import time
import warnings
from transcription_word_list import TranscriptionWordList
from ocr_word_list import OcrWordList
from word import WordAlphab
from tri_matrix import IsHeadBodyTail
class ScoredWordLists():
"""Holds two wordlists, creates their distance scorings and handles calculation of single matching operations.
Attributes:
transcription (TranscriptionWordList): list of words of a transcription
ocr (OcrWordList): list of words of a OCRed file
sim_sets (list[set[str]]): sets of characters that are considered equal
when compared, their distance is 0 though they are not the same character.
This makes the levenshtein distance between two words "weighted".
distance_matrix (list[list[float]]): holds the weighted levenshtein distance score
for each comparison of a word in `transcription` with one in `ocr`.
Indexed first on transcription, then ocr.
distance_matrix[i][j] gets the distance between
the ith word of `transcription` and the jth word of `ocr`.
headbodytail (list[list[IsHeadBodyTail]]): holds isheadbodytail values
for each comparison of a word in `transcription` with one in `ocr`.
See `IsHeadBodyTail` for detailed documentation.
Indexed the same as `distance_matrix`.
do_numeric (bool): if true, returns a distance score of 1 when
two numeric words are not perfectly matched
ignore_cap (bool): if true, ignores all capitalization differences
when comparing two characters
"""
def __init__(self,
transcription: TranscriptionWordList = TranscriptionWordList(),
ocr: OcrWordList = OcrWordList(),
sim_sets: list[set[str]] = [],
do_numeric: bool = False,
ignore_cap: bool = False,
distance_matrix: list[list[float]] = [[]],
headbodytail_matrix: list[list[IsHeadBodyTail]] = [[]]):
self.transcription = transcription
self.ocr = ocr
self.sim_sets = sim_sets
if do_numeric: self.check_both_type()
self.do_numeric = do_numeric
self.ignore_cap = ignore_cap
self.distance_matrix = distance_matrix
self.headbodytail = headbodytail_matrix
@staticmethod
def matrix_to_csv(matrix, output_path):
"""Writes the rows of a matrix to a csv file. Currently not used."""
with open(output_path, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(matrix)
@staticmethod
def csv_to_matrix(input_path: str, type: str):
"""Writes a matrix to a csv file. Currently not used."""
matrix = []
with open(input_path, 'r') as f:
reader = csv.reader(f)
for row in reader:
matrix.append(row)
if type == "int":
return [[int(x) for x in row] for row in matrix]
elif type == "float":
return [[float(x) for x in row] for row in matrix]
elif type == "bool":
return [[x == 'True' for x in row] for row in matrix]
return matrix
def set_distance_head_tail_matrices(self, do_body = False , t: bool = False) -> None:
"""Calculates and sets the distance matrix and the headtailbody matrix."""
if t: t0 = time.time()
self.calculate_and_set_distance_matrix()
if t: t1 = time.time()
self.calculate_and_set_headbodytail_matrix(do_body)
if t:
t2 = time.time()
print(f"Time to calculate the distance matrix: {t1-t0}")
print(f"Time to calculate the headbodytail matrix: {t2-t1}")
def calculate_and_set_distance_matrix(self) -> None:
m = self.transcription.length
n = self.ocr.length
self.distance_matrix = [[-1.0 for _ in range(n)] for _ in range(m)]
for i in range(m):
for j in range(n):
word_distance = self.calculate_word_distance_between_words_at(i, j)
self.distance_matrix[i][j] = word_distance
def calculate_and_set_headbodytail_matrix(self, do_body = False):
m = self.transcription.length
n = self.ocr.length
self.headbodytail = [[IsHeadBodyTail() for _ in range(n)] for _ in range(m)]
for i in range(0, m):
for j in range(0, n):
str_tr = self.transcription.get_content_at_index(i)
str_ocr = self.ocr.get_content_at_index(j)
head, tail = self.check_is_headtail_of_both(i,j)
if do_body:
body = self.check_is_body_of_new(str_tr, str_ocr)
else: body = False
self.headbodytail[i][j].set_headbodytail(head, tail, len(str_tr), len(str_ocr), body)
def check_both_type(self) -> None:
"""Raises exceptions when the types of either wordlist are not correct.
Raises:
Exception: when `do_numeric` and the transcription or ocr wordlists
do not contain words with attribute has_alpha
"""
tr_type = self.transcription.get_type()
ocr_type = self.ocr.get_type()
if tr_type is None:
warnings.warn("The transcription wordlist does not contain any words")
elif not issubclass(tr_type, WordAlphab):
raise Exception("Check numeric actions cannot be done: words in the transcription word list do not contain the attribute has_alphab")
if ocr_type is None:
warnings.warn("The OCR wordlist does not contain any words")
elif not issubclass(ocr_type, WordAlphab):
raise Exception("Check numeric actions cannot be done: words in the ocr wordlist do not contain the attribute has_alphab")
def check_is_headtail_of_both(self, tr_index: int, ocr_index: int) -> tuple[bool,bool]:
"""Returns if a transcription word and an ocr word are each others head or tail.
Worst case time complexity of O(min(m,n))
Args:
tr_index (int): get the transcription word from this index
ocr_index (int): get the ocr word from this index
Returns:
tuple[bool,bool]: (0): one is head of other, (1): one is tail of other.
"""
str1 = self.transcription.get_content_at_index(tr_index)
str2 = self.ocr.get_content_at_index(ocr_index)
n = len(str1)
m = len(str2)
if n == m: return False, False
is_head = True
is_tail = True
i = 0
min_n_m = min(n,m)
while i < min_n_m and (is_head or is_tail):
is_head = is_head and self.char_equals_char(str1[i], str2[i])
is_tail = is_tail and self.char_equals_char(str1[-i-1], str2[-i-1])
i += 1
return is_head, is_tail
def check_is_headtail_of_both_new(self, str1: str, str2: str) -> tuple[bool,bool,bool,bool]:
"""Returns if a transcription word and an ocr word are each others head or tail.
Worst case time complexity of O(min(m,n))
Args:
tr_index (int): get the transcription word from this index
ocr_index (int): get the ocr word from this index
Returns:
tuple[bool,bool]:
(0): str1 is head of str2,
(1): str2 is head of str1,
(2): str1 is tail of str2,
(3): str2 is tail of str1.
"""
n = len(str1)
m = len(str2)
if n == m: return False, False, False, False
is_head = True
is_tail = True
i = 0
min_n_m = min(n,m)
while i < min_n_m and (is_head or is_tail):
is_head = is_head and self.char_equals_char(str1[i], str2[i])
is_tail = is_tail and self.char_equals_char(str1[-i-1], str2[-i-1])
i += 1
if n == min_n_m:
return is_head, False, is_tail, False
else: return False, is_head, False, is_tail
def check_is_body_of_new(self, str1: str, str2: str):
"""Checks if either of the two strings is a substring of the other.
Returns False if they are the same length, or if the substring is
placed at the very front or end of the other string.
Naive pattern search algorithm with worst case O(m(n-m)),
faster algorithms exist and can be worth exploring."""
m = len(str1)
n = len(str2)
if m == n: return False
elif m < n: substr = str1; totalstr = str2
else: substr = str2; totalstr = str1
diflen = abs(n-m)
minlen = min(m,n)
for i in range(diflen):
k = 0
for j in range(minlen):
if self.char_equals_char(totalstr[i+j], substr[j]):
k += 1
else:
break
if k == minlen:
if i == 0:
continue
elif i == diflen:
return False
else: return True
return False
def check_is_body_of(self, str1: str, str2: str):
"""Checks if `str1` is a substring of `str2`. False if they are the same.
Naive pattern search algorithm with worst case O(m(n-m)),
faster algorithms exist and can be worth exploring."""
m = len(str1)
n = len(str2)
if m >= n: return False
for i in range(n-m):
k = 0
for j in range(m):
if self.char_equals_char(str2[i+j], str1[j]):
k += 1
else:
# if self.char_equals_char()
break
if k == m:
if i == 0:
continue
elif i == n-m:
return False
else: return True
return False
def calculate_word_distance_between_words_at(self, tr_index: int, ocr_index: int) -> float:
"""Calculate the weighted levenshtein distance between two strings.
Args:
tr_index (int): gets the transcription word from this index
ocr_index (int): gets the ocr word from this index
Returns:
float: the levenshtein distance, normalised to [0,1] by
dividing it by the max length of both strings
"""
str1 = self.transcription.get_content_at_index(tr_index)
str2 = self.ocr.get_content_at_index(ocr_index)
return self.calculate_word_distance_weighted_levenshtein(str1, str2)
def calculate_word_distance_weighted_levenshtein(self, str1: str, str2: str):
"""An O(n*m) time and space algorithm to compute the normalised edit distance between two strings.
An O(n+m) space algorithm exists (https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows),
but is not implemented here.
There are faster algorithms (e.g. with a trie https://stevehanov.ca/blog/index.php?id=114)
that are worth exploring, as this computing is the bottleneck of the whole program.
Another option could be to parallelise this process, by calculating multiple levenshtein
distances simultaniously."""
m = len(str1)
n = len(str2)
# if both words are of length 0, they are the same, thus distance 0
max_m_n = max(m,n)
if max_m_n == 0: return 0.0
matrix = [[0.0 for _ in range(n + 1)] for _ in range(m + 1)]
# initialize first row and column with ascending numbers
for i in range(m+1):
matrix[i][0] = float(i)
for j in range(n+1):
matrix[0][j] = float(j)
for i in range(1, m+1):
for j in range(1, n+1):
if self.char_equals_char(str1[i-1], str2[j-1]):
matrix[i][j] = matrix[i-1][j-1]
else: matrix[i][j] = 1.0 + min(matrix[i][j-1], matrix[i-1][j], matrix[i-1][j-1])
score = matrix[i][j] / max_m_n
return score
def char_equals_char(self, c1: str, c2: str) -> bool:
"""Checks if two characters are equal, including a similarity sets check.
Worst case time complexity O(len(sim_set)*width(sim_set))."""
if c1 == c2: return True
if self.ignore_cap and c1.lower() == c2.lower(): return True
for sim_set in self.sim_sets:
if c1 in sim_set and c2 in sim_set: return True
return False
def calculate_split_at(self, i: int, j: int, dif_j: int = 1) -> float|None:
"""Gets the distance score for a (split) match between one transcription word and two ocr words.
Args:
i (int): index of the transcription word, its word_id
j (int): index of the ocr word, its word_id
dif_j (int, optional): distance between `j` and the index of the ocr word before it. Defaults to 1.
Returns:
float|None: None if a split is not possible,
otherwise the total distance score of the matching
"""
if self.distance_matrix[i][j] > 0.0:
if self.headbodytail[i][j].is_tail.ocr_in_tr and self.headbodytail[i][j-dif_j].is_head.ocr_in_tr:
score = self.distance_matrix[i][j] + self.distance_matrix[i][j-dif_j]
return 1.0 - score
return None
def calculate_split_at_comb(self, tr_index: int, ocr_indices: list[int]) -> tuple[float|None,int]:
"""Calculates if a (split) match is possible between one transcription word and multiple ocr words.
Args:
tr_index (int): index of the transcription word, its word_id
ocr_indices (list[int]): list of indices of the ocr words, their word_ids,
containing two indices or more.
Last word in the list (the current index) is checked as the tail of the transcription word,
then the rest is traversed back to front to find the best, if any, split.
Returns:
tuple[float|None,int]: (0): total distance score, None if not possible,
(1): and total number of ocr words involved in the split, 0 if not possible
"""
best_split = (None, 0)
if self.distance_matrix[tr_index][ocr_indices[-1]] > 0.0 \
and self.headbodytail[tr_index][ocr_indices[-1]].is_tail.ocr_in_tr:
n = len(ocr_indices) - 2
total_score = 1.0 - self.distance_matrix[tr_index][ocr_indices[-1]]
while n >= 0:
cur_dist = self.distance_matrix[tr_index][ocr_indices[n]]
# if current is body or head: add score
if self.headbodytail[tr_index][ocr_indices[n]].is_body.ocr_in_tr \
or self.headbodytail[tr_index][ocr_indices[n]].is_head.ocr_in_tr:
total_score += 1.0 - cur_dist
else:
break
# if is head: check if it is the best option
if self.headbodytail[tr_index][ocr_indices[n]].is_head.ocr_in_tr:
if best_split[0] is None or best_split[0] > 1.0-total_score:
best_split = (1.0-total_score, len(ocr_indices) - n)
n -= 1
return best_split
def calculate_join_at_new(self, i: int, j: int, dif_i: int = 1) -> float|None:
"""Gets the distance score for a (join) match between two transcription words and one ocr word.
Args:
i (int): index of the transcription word, its word_id
j (int): index of the ocr word, its word_id
dif_i (int, optional): distance between `i` and the index of the transcription word before it. Defaults to 1.
Returns:
float|None: None if a join is not possible,
otherwise the total distance score of the matching
"""
# word_distance > 0.0, is tail, previous is head and combined cost is 1
if self.distance_matrix[i][j] > 0.0 \
and self.headbodytail[i][j].is_tail.tr_in_ocr \
and self.headbodytail[i-dif_i][j].is_head.tr_in_ocr:
score = self.distance_matrix[i][j] + self.distance_matrix[i-dif_i][j]
return 1.0-score
return None
def calculate_join_at_comb(self, tr_indices: list[int], ocr_index: int) -> tuple[float|None,int]:
"""Calculates if a (join) match is possible between multiple transcription words and one ocr word.
Args:
tr_indices (list[int]): list of indices of the transcription words, their word_ids,
containing two indices or more
Last word in the list (the current index) is checked as the tail of the OCR word,
then the rest is traversed back to front to find the best, if any, split.
ocr_index (int): index of the ocr word, its word_id
Returns:
tuple[float|None,int]: (0): total distance score, None if not possible,
(1): and total number of ocr words involved in the join, 0 if not possible
"""
best_join = (None, 0)
if self.distance_matrix[tr_indices[-1]][ocr_index] > 0.0 \
and self.headbodytail[tr_indices[-1]][ocr_index].is_tail.tr_in_ocr:
n = len(tr_indices) - 2
total_score = 1.0 - self.distance_matrix[tr_indices[-1]][ocr_index]
while n >= 0:
cur_dist = self.distance_matrix[tr_indices[n]][ocr_index]
# if current is body or head: add score
if self.headbodytail[tr_indices[n]][ocr_index].is_body.tr_in_ocr \
or self.headbodytail[tr_indices[n]][ocr_index].is_head.tr_in_ocr:
total_score += 1.0 - cur_dist
else:
break
# if is head: check if it is the best option
if self.headbodytail[tr_indices[n]][ocr_index].is_head.tr_in_ocr:
if best_join[0] is None or best_join[0] > 1.0-total_score:
best_join = (1.0-total_score, len(tr_indices) - n)
n -= 1
return best_join
def calculate_matching_at(self, i: int, j: int) -> float:
"""Calculates if a match is possible between one transcription word and one ocr word.
Args:
i (int): index of the transcription word, its word_id
j (int): index of the ocr word, its word_id
Returns:
float: the distance score of the matching (1.0 for bad match, 0.0 for perfect match)
"""
cur_distance = self.distance_matrix[i][j]
if self.do_numeric and cur_distance > 0.0 and not (self.transcription.get_word_at_index(i).has_alpha) and (not self.ocr.get_word_at_index(j).has_alpha):
cur_distance = 1.0
return cur_distance
def calculate_matching_at_with_prev(self, i: int, j: int, prev_i: int, prev_j: int) -> float:
"""Calculates if a match is possible between one transcription word and one ocr word.
Args:
i (int): index of the transcription word, its word_id
j (int): index of the ocr word, its word_id
prev_i (int): index of the previous transcription word, its word_id
prev_j (int): index of the previous ocr word, its word_id
Returns:
float: the distance score of the matching (1.0 for bad match, 0.0 for perfect match)
"""
cur_distance = self.distance_matrix[i][j]
if self.do_numeric and cur_distance > 0.0 and not (self.transcription.get_word_at_index(prev_i).has_alpha) and (not self.ocr.get_word_at_index(prev_j).has_alpha):
cur_distance = 1.0
return cur_distance