Skip to content

Commit 6b8efe2

Browse files
committed
move to core==1.0:
- update dependencies and API to reflect the 1.0 changes
- for building test assets, use `ocrd zip bag` to create self-contained directories atomically (but still work around core#176)
- for testing, depend on ocrd_tesserocr==0.3 (which provides overwrite_words, thereby getting rid of the manual step of stripping words beforehand)
- when creating output files, try to use the input file ID (modulo the input file grp) in the basename

Note: testing depends on core#266!
1 parent 9c7ed8a commit 6b8efe2

File tree

9 files changed

+68
-74
lines changed

9 files changed

+68
-74
lines changed

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ test: test/assets
3939

4040
# prepare test assets
4141
test/assets:
42+
# TODO: instead of this, use bag repos, or add something useful to OCR-D/assets
4243
test/prepare_gt.bash $@
4344

4445
.PHONY: help deps deps-test install test

ocrd_keraslm/scripts/run.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import json
99
import click
1010

11-
from ocrd_keraslm import lib
11+
from .. import lib
1212

1313
class SortedGroup(click.Group):
1414
def list_commands(self, ctx):

ocrd_keraslm/wrapper/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import click
22

33
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
4-
from ocrd_keraslm.wrapper import KerasRate
4+
from . import KerasRate
55

66
@click.command()
77
@ocrd_cli_options

ocrd_keraslm/wrapper/rate.py

+35-17
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,25 @@
11
from __future__ import absolute_import
2+
import os
23
from math import log, ceil
34

4-
from ocrd import Processor, MIMETYPE_PAGE
5-
from ocrd.validator.page_validator import PageValidator, ConsistencyError
6-
from ocrd.utils import getLogger, concat_padded, xywh_from_points, points_from_xywh
7-
from ocrd.model.ocrd_page import from_file, to_xml, GlyphType, CoordsType, TextEquivType
8-
from ocrd.model.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType
5+
from ocrd import Processor
6+
from ocrd_validators.page_validator import PageValidator, ConsistencyError
7+
from ocrd_utils import (
8+
getLogger, concat_padded,
9+
xywh_from_points, points_from_xywh,
10+
MIMETYPE_PAGE
11+
)
12+
from ocrd_modelfactory import page_from_file
13+
from ocrd_models.ocrd_page import (
14+
to_xml, GlyphType,
15+
MetadataItemType, LabelsType, LabelType,
16+
CoordsType, TextEquivType
17+
)
918

1019
import networkx as nx
1120

12-
from ocrd_keraslm.wrapper.config import OCRD_TOOL
13-
from ocrd_keraslm import lib
21+
from .config import OCRD_TOOL
22+
from .. import lib
1423

1524
LOG = getLogger('processor.KerasRate')
1625

@@ -54,13 +63,15 @@ def process(self):
5463

5564
prev_traceback = None
5665
prev_pcgts = None
66+
prev_file_id = None
5767
for (n, input_file) in enumerate(self.input_files):
58-
LOG.info("INPUT FILE %i / %s", n, input_file)
59-
pcgts = from_file(self.workspace.download_file(input_file))
68+
page_id = input_file.pageId or input_file.ID
69+
LOG.info("INPUT FILE %i / %s", n, page_id)
70+
pcgts = page_from_file(self.workspace.download_file(input_file))
6071
LOG.info("Scoring text in page '%s' at the %s level", pcgts.get_pcGtsId(), level)
6172

6273
# annotate processing metadata:
63-
metadata = pcgts.get_Metadata() # ensured by from_file()
74+
metadata = pcgts.get_Metadata() # ensured by page_from_file()
6475
metadata.add_MetadataItem(
6576
MetadataItemType(type_="processingStep",
6677
name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
@@ -115,11 +126,13 @@ def process(self):
115126
page_update_higher_textequiv_levels(level, pcgts)
116127

117128
# write back result
118-
file_id = concat_padded(self.output_file_grp, n)
129+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
130+
if file_id == input_file.ID:
131+
file_id = concat_padded(self.output_file_grp, n)
119132
self.workspace.add_file(
120133
ID=file_id,
121134
file_grp=self.output_file_grp,
122-
basename=file_id + '.xml', # with suffix or bare?
135+
local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
123136
mimetype=MIMETYPE_PAGE,
124137
content=to_xml(pcgts),
125138
)
@@ -140,15 +153,18 @@ def process(self):
140153
page_update_higher_textequiv_levels(level, prev_pcgts)
141154

142155
# write back result
143-
file_id = concat_padded(self.output_file_grp, n - 1)
156+
file_id = prev_file_id.replace(self.input_file_grp, self.output_file_grp)
157+
if file_id == prev_file_id:
158+
file_id = concat_padded(self.output_file_grp, n - 1)
144159
self.workspace.add_file(
145160
ID=file_id,
146161
file_grp=self.output_file_grp,
147-
basename=file_id + '.xml', # with suffix or bare?
162+
local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
148163
mimetype=MIMETYPE_PAGE,
149164
content=to_xml(prev_pcgts),
150165
)
151-
166+
167+
prev_file_id = input_file.ID
152168
prev_pcgts = pcgts
153169
prev_traceback = traceback
154170

@@ -160,11 +176,13 @@ def process(self):
160176
page_update_higher_textequiv_levels(level, prev_pcgts)
161177

162178
# write back result
163-
file_id = concat_padded(self.output_file_grp, n)
179+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
180+
if file_id == input_file.ID:
181+
file_id = concat_padded(self.output_file_grp, n)
164182
self.workspace.add_file(
165183
ID=file_id,
166184
file_grp=self.output_file_grp,
167-
basename=file_id + '.xml', # with suffix or bare?
185+
local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
168186
mimetype=MIMETYPE_PAGE,
169187
content=to_xml(prev_pcgts),
170188
)

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ocrd >= 0.15.2
1+
ocrd >= 1.0.0b10
22
click
33
keras >= 2.2.4
44
numpy

requirements_test.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
pytest
2-
ocrd_tesserocr
2+
ocrd_tesserocr >= 0.3.0

setup.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""
33
Installs:
44
- keraslm-rate
5+
- ocrd-keraslm-rate
56
"""
67
import codecs
78

@@ -20,15 +21,7 @@
2021
url='https://github.com/OCR-D/ocrd_keraslm',
2122
license='Apache License 2.0',
2223
packages=find_packages(exclude=('tests', 'docs')),
23-
install_requires=[
24-
'ocrd >= 0.15.2',
25-
'keras',
26-
'click',
27-
'numpy',
28-
'tensorflow',
29-
'h5py',
30-
'networkx',
31-
],
24+
install_requires=open('requirements.txt').read().split('\n'),
3225
extras_require={
3326
'plotting': [
3427
'sklearn',

test/prepare_gt.bash

+11-20
Original file line numberDiff line numberDiff line change
@@ -42,37 +42,28 @@ EOF
4242
for GT_FILE in $GT_FILES; do
4343
test -f "$CACHE_DIR/${GT_FILE}.zip" ||
4444
wget -P "$CACHE_DIR" http://www.ocr-d.de/sites/all/GTDaten/${GT_FILE}.zip
45-
unzip -d "$TMP_DIR" "$CACHE_DIR/${GT_FILE}.zip"
46-
pushd "$TMP_DIR/$GT_FILE/$GT_FILE"
45+
unzip -jod "$TMP_DIR/$GT_FILE" "$CACHE_DIR/${GT_FILE}.zip"
46+
pushd "$TMP_DIR/$GT_FILE"
4747
ocrd workspace init .
4848
ZEROS=0000
4949
i=0
50-
for PAGE_FILE in page/*.xml; do
50+
for PAGE_FILE in *.xml; do
51+
test "x$PAGE_FILE" = xmets.xml && continue
5152
i=$((i+1))
5253
ID=${ZEROS:0:$((4-${#i}))}$i
5354
IMG_FILE=$(xsltproc "$TMP_DIR/page-extract-imagefilename.xsl" "$PAGE_FILE")
5455
test -f "$IMG_FILE"
55-
ocrd workspace add -G OCR-D-IMG -i OCR-D-IMG_$ID -g OCR-D-IMG_$ID -m image/tiff "$IMG_FILE"
56-
ocrd workspace add -G OCR-D-GT-PAGE -i OCR-D-GT-PAGE_$ID -g OCR-D-IMG_$ID -m application/vnd.prima.page+xml "$PAGE_FILE"
56+
ocrd workspace add -G OCR-D-IMG -i OCR-D-IMG_$ID -g phys_$ID -m image/tiff "$IMG_FILE"
57+
ocrd workspace add -G OCR-D-GT-PAGE -i OCR-D-GT-PAGE_$ID -g phys_$ID -m application/vnd.prima.page+xml "$PAGE_FILE"
58+
# workaround for OCR-D/core/issues/176 (still true for ocrd v1.0.0b10 !!)
59+
sed -i -e "s|imageFilename=\"[^\"]*\"|imageFilename=\"OCR-D-IMG/OCR-D-IMG_$ID\"|" "$PAGE_FILE"
5760
done
61+
ocrd zip bag -i ${GT_FILE}.zip -D full -Z -I
5862
popd
5963
done
6064

61-
# this would break URIs: (still true for ocrd v0.15.2 !!)
62-
#mv "$TMP_DIR" "$1" # atomic
63-
# clone+cp instead:
64-
trap "rm -fr '$TMP_DIR' '$1'" ERR
65-
mkdir -p "$1"
66-
for GT_FILE in $GT_FILES; do # not so atomic
67-
WORKSPACE="$TMP_DIR/$GT_FILE/$GT_FILE"
68-
ocrd workspace clone -l "$WORKSPACE/mets.xml" "$1/$GT_FILE"
69-
# workaround for OCR-D/core/issues/176 (still true for ocrd v0.15.2 !!)
70-
for PAGE_FILE in "$1/$GT_FILE/OCR-D-GT-PAGE/"*.xml; do
71-
sed -ie "s|imageFilename=\"|imageFilename=\"file://$PWD/$1/$GT_FILE/OCR-D-IMG/|" "$PAGE_FILE"
72-
done
73-
cp "$TMP_DIR/${GT_FILE}.txt" "$1"
74-
done
75-
rm -fr "$TMP_DIR"
65+
mv "$TMP_DIR" "$1" # atomic
66+
7667

7768

7869

test/test_wrapper.py

+15-24
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
from unittest import TestCase, main
44

55
from ocrd.resolver import Resolver
6-
from ocrd.model.ocrd_page import from_file, to_xml
7-
from ocrd import MIMETYPE_PAGE
6+
from ocrd_models.ocrd_page import to_xml
7+
from ocrd_modelfactory import page_from_file
8+
from ocrd_utils import MIMETYPE_PAGE
89
from ocrd_tesserocr.recognize import TesserocrRecognize
910
from ocrd_keraslm.wrapper import KerasRate
1011

@@ -20,8 +21,11 @@ def setUp(self):
2021

2122
def runTest(self):
2223
resolver = Resolver()
23-
workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/mets.xml', dst_dir=WORKSPACE_DIR, download=True)
24+
workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/data/mets.xml', dst_dir=WORKSPACE_DIR, download=True)
2425
self.assertIsNotNone(workspace)
26+
#
27+
# rate text alternative 1 on the word level:
28+
#
2529
KerasRate(
2630
workspace,
2731
input_file_grp='OCR-D-GT-PAGE', # has wrong tokenisation but that's ok now
@@ -33,36 +37,23 @@ def runTest(self):
3337
workspace.save_mets()
3438
for file in workspace.mets.find_files(fileGrp='OCR-D-LM-WORD'):
3539
continue # todo: for some reason, from_file yields NoneType here
36-
pcgts = from_file(file)
40+
pcgts = page_from_file(file)
3741
metadata = pcgts.get_Metadata()
3842
self.assertIsNotNone(metadata)
3943
metadataitems = metadata.get_MetadataItem()
4044
self.assertIsNotNone(metadataitems)
4145
rated = any([i for i in metadataitems if i.get_value() == 'ocrd-keraslm-rate'])
4246
self.assertTrue(rated)
43-
for file in workspace.mets.find_files(fileGrp='OCR-D-GT-PAGE'):
44-
grp='OCR-D-GT-SEG-LINE'
45-
ID=grp + '_' + file.ID.split(sep='_')[-1]
46-
pcgts = from_file(file)
47-
page = pcgts.get_Page()
48-
for region in page.get_TextRegion():
49-
for line in region.get_TextLine():
50-
line.set_TextEquiv([]) # remove text results (interferes with ocrd_tesserocr)
51-
line.set_Word([]) # remove word annotation (interferes with ocrd_tesserocr, has wrong tokenization)
52-
self.assertIsNotNone(
53-
workspace.add_file(
54-
ID=ID,
55-
file_grp=grp,
56-
basename=ID + '.xml',
57-
mimetype=MIMETYPE_PAGE,
58-
content=to_xml(pcgts)))
47+
#
48+
# rate and viterbi-decode all text alternatives on the glyph level:
49+
#
5950
TesserocrRecognize( # we need this to get alternatives to decode
6051
workspace,
61-
input_file_grp='OCR-D-GT-SEG-LINE',
62-
#input_file_grp='OCR-D-GT-PAGE', # only possible with ocrd_tesserocr >= 0.3.0
52+
input_file_grp='OCR-D-GT-PAGE', # has wrong tokenisation but that's ok now
6353
output_file_grp='OCR-D-OCR-TESS-GLYPH',
6454
parameter={'textequiv_level': 'glyph',
65-
'model': 'deu-frak'}
55+
'overwrite_words': True,
56+
'model': 'deu-frak'} # old model for alternatives
6657
).process()
6758
workspace.save_mets()
6859
KerasRate(
@@ -77,7 +68,7 @@ def runTest(self):
7768
workspace.save_mets()
7869
for file in workspace.mets.find_files(fileGrp='OCR-D-LM-GLYPH'):
7970
continue # todo: for some reason, from_file yields NoneType here
80-
pcgts = from_file(file)
71+
pcgts = page_from_file(file)
8172
metadata = pcgts.get_Metadata()
8273
self.assertIsNotNone(metadata)
8374
metadataitems = metadata.get_MetadataItem()

0 commit comments

Comments
 (0)