Skip to content

Commit 2535f4b

Browse files
authored
Enumerate entry/sense/synset ids for validation (#229)
Fixes #228
1 parent c994588 commit 2535f4b

File tree

7 files changed

+126
-3
lines changed

7 files changed

+126
-3
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
* `Sense.relation_map()` method ([#216])
1313
* `Synset.relation_map()` method ([#167], [#216])
1414

15+
## Fixed
16+
17+
* Enumerate repeated entry, sense, synset IDs for validation ([#228])
18+
1519

1620
## [v0.10.1]
1721

@@ -706,3 +710,4 @@ abandoned, but this is an entirely new codebase.
706710
[#215]: https://github.com/goodmami/wn/issues/215
707711
[#216]: https://github.com/goodmami/wn/issues/216
708712
[#221]: https://github.com/goodmami/wn/issues/221
713+
[#228]: https://github.com/goodmami/wn/issues/228

tests/data/E101-0.xml

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
3+
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
5+
<!-- duplicate ID in lexical entries -->
6+
7+
<Lexicon id="test-e101"
8+
label="Testing E101"
9+
language="en"
10+
11+
license="https://creativecommons.org/licenses/by/4.0/"
12+
version="1">
13+
14+
<LexicalEntry id="test-e101-foo-n">
15+
<Lemma partOfSpeech="n" writtenForm="foo" />
16+
<Sense id="test-e101-foo" synset="test-e101-01-n" />
17+
</LexicalEntry>
18+
19+
<LexicalEntry id="test-e101-foo-n">
20+
<Lemma partOfSpeech="n" writtenForm="foo2" />
21+
<Sense id="test-e101-foo2" synset="test-e101-01-n" />
22+
</LexicalEntry>
23+
24+
<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
25+
26+
</Lexicon>
27+
28+
</LexicalResource>

tests/data/E101-1.xml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
3+
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
5+
<!-- duplicate ID in senses -->
6+
7+
<Lexicon id="test-e101"
8+
label="Testing E101"
9+
language="en"
10+
11+
license="https://creativecommons.org/licenses/by/4.0/"
12+
version="1">
13+
14+
<LexicalEntry id="test-e101-foo-n">
15+
<Lemma partOfSpeech="n" writtenForm="foo" />
16+
<Sense id="test-e101-foo" synset="test-e101-01-n" />
17+
<Sense id="test-e101-foo" synset="test-e101-02-n" />
18+
</LexicalEntry>
19+
20+
<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
21+
<Synset id="test-e101-02-n" ili="i12346" partOfSpeech="n" />
22+
23+
</Lexicon>
24+
25+
</LexicalResource>

tests/data/E101-2.xml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
3+
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
5+
<!-- duplicate ID in synsets -->
6+
7+
<Lexicon id="test-e101"
8+
label="Testing E101"
9+
language="en"
10+
11+
license="https://creativecommons.org/licenses/by/4.0/"
12+
version="1">
13+
14+
<LexicalEntry id="test-e101-foo-n">
15+
<Lemma partOfSpeech="n" writtenForm="foo" />
16+
<Sense id="test-e101-foo" synset="test-e101-01-n" />
17+
</LexicalEntry>
18+
19+
<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
20+
<Synset id="test-e101-01-n" ili="i12346" partOfSpeech="n" />
21+
22+
</Lexicon>
23+
24+
</LexicalResource>

tests/data/E101-3.xml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
3+
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
5+
<!-- duplicate ID across different entity types (the LexicalEntry and its Sense share the same ID) -->
6+
7+
<Lexicon id="test-e101"
8+
label="Testing E101"
9+
language="en"
10+
11+
license="https://creativecommons.org/licenses/by/4.0/"
12+
version="1">
13+
14+
<LexicalEntry id="test-e101-foo-n">
15+
<Lemma partOfSpeech="n" writtenForm="foo" />
16+
<Sense id="test-e101-foo-n" synset="test-e101-01-n" />
17+
</LexicalEntry>
18+
19+
<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
20+
21+
</Lexicon>
22+
23+
</LexicalResource>

tests/validate_test.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from pathlib import Path
2+
3+
from wn import lmf
4+
from wn.validate import validate
5+
6+
7+
def _assert_invalid(select: str, path: Path) -> None:
    """Assert that the validation check *select* flags the lexicon at *path*.

    The first lexicon loaded from *path* is validated with only the
    *select* check enabled, and the resulting report must list at least
    one item for that check.
    """
    resource = lmf.load(path, progress_handler=None)
    lexicon = resource["lexicons"][0]
    result = validate(lexicon, select=[select], progress_handler=None)
    # Emit the full report to stdout alongside the assertion below.
    print(result)
    flagged = result[select]["items"]
    assert len(flagged) > 0
12+
13+
14+
def test_E101(datadir):
    """Check that each E101 fixture file is reported by the E101 check."""
    fixtures = (
        "E101-0.xml",
        "E101-1.xml",
        "E101-2.xml",
        "E101-3.xml",
    )
    # Fixtures are checked in order; the first failing one stops the test.
    for filename in fixtures:
        _assert_invalid("E101", datadir / filename)

wn/validate.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ def _non_unique_id(lex: lmf.Lexicon, ids: _Ids) -> _Result:
5757
[lex['id']],
5858
(f['id'] for e in _entries(lex) for f in _forms(e) if f.get('id')),
5959
(sb['id'] for sb in lex.get('frames', []) if sb.get('id')),
60-
ids['entry'],
61-
ids['sense'],
62-
ids['synset'],
60+
ids['entry'].elements(),
61+
ids['sense'].elements(),
62+
ids['synset'].elements(),
6363
))
6464

6565

0 commit comments

Comments
 (0)