-
Notifications
You must be signed in to change notification settings - Fork 79
/
hocr-eval-lines
executable file
·110 lines (90 loc) · 3.11 KB
/
hocr-eval-lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# compute statistics about the quality of the geometric segmentation
# at the level of the given OCR element
import argparse
import re
from lxml import html
################################################################
# misc library code
################################################################
def get_text(node):
    """Return the text content of *node* with whitespace runs collapsed.

    All text nodes found below *node* (XPath ``.//text()``) are concatenated
    in document order, then every run of whitespace is replaced by a single
    space.  Leading/trailing whitespace is collapsed but not stripped.
    """
    joined = ''.join(node.xpath(".//text()"))
    return re.sub(r'\s+', ' ', joined)
# Matches every run of characters that are NOT an ASCII letter, an ASCII
# digit, or one of the punctuation marks . , ! ? : ;
simp_re = re.compile(r'[^a-zA-Z0-9.,!?:;]+')


def normalize(s):
    """Return *s* simplified for comparison.

    Each run of characters matched by ``simp_re`` is replaced by a single
    space, and leading/trailing whitespace is removed from the result.
    """
    return simp_re.sub(' ', s).strip()
def edit_distance(a, b, threshold=999999):
    """Return the Levenshtein distance between the sequences *a* and *b*.

    Insertions, deletions and substitutions each cost 1.

    *threshold* is a cut-off for callers that only care about distances
    strictly below it: as soon as the true distance is known to be at least
    *threshold*, the computation stops and some value ``>= threshold`` is
    returned (a lower bound on the true distance, not necessarily the exact
    distance).  Whenever the true distance is below *threshold* the exact
    distance is returned.
    """
    if a == b:
        return 0
    m = len(a)
    n = len(b)
    # previous[j] is the edit distance between the first i - 1 items of a
    # and the first j items of b; only two rows of the table are ever needed.
    previous = list(range(n + 1))
    for i in range(1, m + 1):
        item_a = a[i - 1]
        current = [i]
        for j in range(1, n + 1):
            cij = 0 if item_a == b[j - 1] else 1
            current.append(min(previous[j] + 1,
                               current[j - 1] + 1,
                               previous[j - 1] + cij))
        # Row minima never decrease from one row to the next, so the minimum
        # of a row is a lower bound for the final distance.  A single large
        # cell is NOT such a bound (cells far from the diagonal are always
        # large), which is why the cut-off looks at the whole row.
        row_min = min(current)
        if row_min >= threshold:
            return row_min
        previous = current
    return previous[n]
################################################################
# main program
################################################################
# Command line: a plain-text ground-truth file and an hOCR file to evaluate.
parser = argparse.ArgumentParser(
    description=("Compute statistics about the quality of the geometric "
                 "segmentation at the level of the given OCR element")
)
parser.add_argument(
    "tfile", help="text file with the true lines", type=argparse.FileType('r'))
parser.add_argument(
    "hfile",
    help="hOCR file with the actually recognized lines",
    type=argparse.FileType('r'))
parser.add_argument("-v", "--verbose", action="store_true")
args = parser.parse_args()

# Ground truth: one line of text per line of the file.
truth_lines = args.tfile.read().split('\n')
# Recognized text: the text content of every element of class "ocr_line".
actual_doc = html.parse(args.hfile)
actual_lines = [
    get_text(node) for node in actual_doc.xpath("//*[@class='ocr_line']")
]

# Compare normalized text only and ignore lines that normalize to nothing.
truth_lines = [normalize(s) for s in truth_lines]
truth_lines = [s for s in truth_lines if s != ""]
actual_lines = [normalize(s) for s in actual_lines]
actual_lines = [s for s in actual_lines if s != ""]

# Greedy matching: each recognized line consumes the closest ground-truth
# line that is still unmatched; its edit distance counts as OCR errors.
remaining = list(truth_lines)
ocr_errors = 0
for actual_line in actual_lines:
    min_d = 999999
    min_i = -1
    for index, true_line in enumerate(remaining):
        # The best distance so far is the cut-off: worse candidates may be
        # abandoned early by edit_distance.
        d = edit_distance(true_line, actual_line, min_d)
        if d < min_d:
            min_d = d
            min_i = index
    if min_i < 0:
        # More recognized lines than ground-truth lines.  Fail explicitly
        # (and before remaining[min_i] is touched) instead of relying on an
        # assert, which is skipped under -O.
        raise SystemExit(
            "hocr-eval-lines: no ground-truth line left to match "
            "recognized line: %r" % (actual_line,))
    if args.verbose and min_d > 0:
        print("distance", min_d)
        print("\t" + actual_line)
        print("\t" + remaining[min_i])
    del remaining[min_i]
    ocr_errors += min_d

# Ground-truth lines that no recognized line consumed count, character by
# character, as segmentation errors.
segmentation_errors = sum(len(s) for s in remaining)

print("segmentation_errors", segmentation_errors)
print("ocr_errors", ocr_errors)