Skip to content

Commit 0150052

Browse files
committed
Revise annotation content data model
Closes #987
1 parent cc7b3c2 commit 0150052

File tree

17 files changed

+467
-456
lines changed

17 files changed

+467
-456
lines changed

Tekst-API/demo/contents.json

Lines changed: 204 additions & 174 deletions
Large diffs are not rendered by default.

Tekst-API/demo/resources.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,10 @@
481481
"multi_value_delimiter": "/",
482482
"anno_integration": {
483483
"groups": [
484+
{
485+
"key": "form",
486+
"translations": [{ "locale": "*", "translation": "Form" }]
487+
},
484488
{
485489
"key": "ne",
486490
"translations": [
@@ -497,6 +501,19 @@
497501
}
498502
],
499503
"item_props": [
504+
{
505+
"key": "form",
506+
"translations": [{ "locale": "*", "translation": "Form" }],
507+
"group": "form"
508+
},
509+
{
510+
"key": "eol",
511+
"translations": [
512+
{ "locale": "enUS", "translation": "End-of-line" },
513+
{ "locale": "deDE", "translation": "Zeilenende" }
514+
],
515+
"group": "misc"
516+
},
500517
{
501518
"key": "len",
502519
"translations": [

Tekst-API/openapi.json

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"title": "Tekst-Dev",
55
"summary": "An online text research platform",
66
"contact": {},
7-
"version": "0.18.9a0"
7+
"version": "0.19.0a0"
88
},
99
"servers": [
1010
{
@@ -4956,15 +4956,9 @@
49564956
"in": "query",
49574957
"required": false,
49584958
"schema": {
4959-
"enum": [
4960-
"json",
4961-
"tekst-json",
4962-
"csv"
4963-
],
4964-
"type": "string",
4959+
"$ref": "#/components/schemas/ResourceExportFormat",
49654960
"description": "Export format",
4966-
"default": "json",
4967-
"title": "Format"
4961+
"default": "json"
49684962
},
49694963
"description": "Export format"
49704964
},
@@ -15623,6 +15617,14 @@
1562315617
],
1562415618
"title": "ResourceDescriptionTranslation"
1562515619
},
15620+
"ResourceExportFormat": {
15621+
"type": "string",
15622+
"enum": [
15623+
"json",
15624+
"tekst-json",
15625+
"csv"
15626+
]
15627+
},
1562615628
"ResourceSearchQuery": {
1562715629
"properties": {
1562815630
"cmn": {
@@ -17875,22 +17877,6 @@
1787517877
"title": "Type",
1787617878
"description": "Type of the resource to search in"
1787717879
},
17878-
"token": {
17879-
"type": "string",
17880-
"maxLength": 512,
17881-
"minLength": 0,
17882-
"title": "Token",
17883-
"description": "Token search query",
17884-
"default": "",
17885-
"optionalNullable": true
17886-
},
17887-
"twc": {
17888-
"type": "boolean",
17889-
"title": "Twc",
17890-
"description": "Whether to interpret wildcards in the token query",
17891-
"default": false,
17892-
"optionalNullable": true
17893-
},
1789417880
"anno": {
1789517881
"items": {
1789617882
"$ref": "#/components/schemas/TextAnnotationQueryEntry"
@@ -17927,34 +17913,18 @@
1792717913
},
1792817914
"TextAnnotationToken": {
1792917915
"properties": {
17930-
"token": {
17931-
"type": "string",
17932-
"maxLength": 4096,
17933-
"minLength": 1,
17934-
"title": "Token",
17935-
"description": "Text token"
17936-
},
1793717916
"annotations": {
1793817917
"items": {
1793917918
"$ref": "#/components/schemas/TextAnnotationEntry"
1794017919
},
1794117920
"type": "array",
1794217921
"maxItems": 128,
1794317922
"title": "Annotations",
17944-
"description": "List of annotations on this token",
17923+
"description": "List of annotations on a token",
1794517924
"default": []
17946-
},
17947-
"lb": {
17948-
"type": "boolean",
17949-
"title": "Lb",
17950-
"description": "Whether this token ends a line",
17951-
"default": false
1795217925
}
1795317926
},
1795417927
"type": "object",
17955-
"required": [
17956-
"token"
17957-
],
1795817928
"title": "TextAnnotationToken"
1795917929
},
1796017930
"TextAnnotationValue": {

Tekst-API/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tekst"
3-
version = "0.18.9a0"
3+
version = "0.19.0a0"
44
description = "An online text research platform"
55
readme = "README.md"
66
authors = [
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from tekst.db import Database
2+
3+
4+
async def migration(db: Database) -> None:
    """Fold the legacy per-token ``token`` and ``lb`` fields into annotations.

    For every content document whose ``resource_type`` is ``"textAnnotation"``:

    - a truthy ``token`` value becomes a
      ``{"key": "form", "value": [<token>]}`` annotation on that token,
    - a truthy ``lb`` flag becomes a
      ``{"key": "eol", "value": ["true"]}`` annotation on that token,
    - both legacy fields are removed from the token object.

    The legacy fields are dropped in the very same write that adds the new
    annotations. A document is therefore never stored half-migrated, and
    running the migration again (e.g. after an interruption) finds no legacy
    fields on already-processed documents and cannot append duplicates.

    Args:
        db: Database handle; only its ``contents`` collection is used.
    """
    async for content in db.contents.find({"resource_type": "textAnnotation"}):
        changed = False
        # tolerate documents with a missing or null "tokens" field
        for token in content.get("tokens") or []:
            if "token" not in token and "lb" not in token:
                # nothing legacy left on this token (already migrated)
                continue
            changed = True

            # pop instead of get: the legacy fields must disappear together
            # with the write that introduces their annotation counterparts
            token_form = token.pop("token", None)
            ends_line = token.pop("lb", None)

            new_annos = []
            if token_form:
                new_annos.append({"key": "form", "value": [token_form]})
            if ends_line:
                new_annos.append({"key": "eol", "value": ["true"]})

            if new_annos:
                # a stored token may lack "annotations" (or hold null)
                annos = token.get("annotations") or []
                annos.extend(new_annos)
                token["annotations"] = annos

        if changed:
            # replace with updated content doc; untouched docs are not rewritten
            await db.contents.replace_one({"_id": content["_id"]}, content)

Tekst-API/tekst/resources/text_annotation.py

Lines changed: 21 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,6 @@ def _rtype_index_mappings(
5454
"tokens": {
5555
"type": "nested",
5656
"properties": {
57-
"token": {
58-
"type": "keyword",
59-
"normalizer": "no_diacritics_normalizer",
60-
"fields": {
61-
"strict": {
62-
"type": "keyword",
63-
"normalizer": "lowercase_normalizer",
64-
}
65-
},
66-
},
6757
"annotations": {
6858
"type": "nested",
6959
"properties": {
@@ -101,10 +91,14 @@ def _rtype_index_doc(
10191
cls,
10292
content: "TextAnnotationContent",
10393
) -> dict[str, Any] | None:
94+
token_forms = []
95+
for token in content.tokens:
96+
for anno in token.annotations:
97+
if anno.key == "form":
98+
token_forms.append("/".join(anno.value))
10499
return {
105100
"tokens": [
106101
{
107-
"token": token.token or "",
108102
"annotations": [
109103
{
110104
"key": anno.key,
@@ -119,7 +113,7 @@ def _rtype_index_doc(
119113
}
120114
for token in content.tokens
121115
],
122-
"tokens_concat": "; ".join(token.token or "" for token in content.tokens),
116+
"tokens_concat": "; ".join(token_forms),
123117
}
124118

125119
@classmethod
@@ -134,31 +128,9 @@ def rtype_es_queries(
134128
res_id = str(query.common.resource_id)
135129
q_id = str(uuid4())
136130

137-
token_usr_q = (query.resource_type_specific.token or "").strip(" ") or None
138-
token_es_q = []
139-
token_wc = query.resource_type_specific.token_wildcards
140131
annos_usr_q = query.resource_type_specific.annotations or []
141132
annos_es_q = []
142133

143-
# process token query
144-
if token_usr_q and token_usr_q.strip("* "):
145-
# handle actual token query with content
146-
token_es_q.append(
147-
{
148-
"wildcard": {
149-
f"resources.{res_id}.tokens.token{strict_suffix}": {
150-
"value": token_usr_q,
151-
}
152-
}
153-
}
154-
if token_wc
155-
else {
156-
"term": {
157-
f"resources.{res_id}.tokens.token{strict_suffix}": token_usr_q
158-
}
159-
}
160-
)
161-
162134
# process annotation queries
163135
for anno_q in annos_usr_q:
164136
if anno_q.key and not anno_q.value:
@@ -176,6 +148,7 @@ def rtype_es_queries(
176148
)
177149
elif anno_q.key and anno_q.value:
178150
# both key and value are set: query for specific key/value combination
151+
anno_v = anno_q.value.strip()
179152
anno_k_q = {
180153
"term": {f"resources.{res_id}.tokens.annotations.key": anno_q.key}
181154
}
@@ -186,7 +159,7 @@ def rtype_es_queries(
186159
f"resources.{res_id}.tokens.annotations"
187160
f".value{strict_suffix}"
188161
): {
189-
"value": anno_q.value,
162+
"value": anno_v,
190163
}
191164
}
192165
}
@@ -196,7 +169,7 @@ def rtype_es_queries(
196169
(
197170
f"resources.{res_id}.tokens.annotations"
198171
f".value{strict_suffix}"
199-
): anno_q.value
172+
): anno_v
200173
}
201174
}
202175
)
@@ -214,20 +187,19 @@ def rtype_es_queries(
214187
)
215188

216189
# add token and annotation queries to the ES queries
217-
if token_es_q or annos_es_q:
218-
es_sub_queries = [*token_es_q, *annos_es_q]
190+
if annos_es_q:
219191
es_queries.append(
220192
{
221193
"nested": {
222194
"path": f"resources.{res_id}.tokens",
223195
"inner_hits": {"name": q_id},
224196
"query": {
225197
"bool": {
226-
"must": es_sub_queries,
198+
"must": annos_es_q,
227199
},
228200
}
229-
if len(es_sub_queries) > 1
230-
else es_sub_queries[0],
201+
if len(annos_es_q) > 1
202+
else annos_es_q[0],
231203
}
232204
}
233205
)
@@ -243,14 +215,13 @@ def _highlights_generator(hit: dict[str, Any]) -> list[str]:
243215
hl_strings.extend(hl_v)
244216
for ih in hit.get("inner_hits", {}).values():
245217
for ih_hit in ih.get("hits", {}).get("hits", []):
246-
token = ih_hit["_source"]["token"]
247-
annos = ih_hit["_source"]["annotations"]
248-
values = [a["value"] for a in annos] if annos else []
249-
values_strings = []
250-
for v in values:
251-
values_strings.extend(v if isinstance(v, list) else [v])
252-
annos = f" ({'; '.join(values_strings)})"
253-
hl_strings.append(f"{token} {annos}")
218+
values = [
219+
a["value"] for a in ih_hit["_source"]["annotations"] or []
220+
]
221+
values_strings = [
222+
", ".join(v) if isinstance(v, list) else v for v in values
223+
]
224+
hl_strings.append("; ".join(values_strings))
254225
return hl_strings
255226

256227
return _highlights_generator
@@ -302,7 +273,6 @@ async def _export_csv(
302273
[
303274
"LOCATION",
304275
"POSITION",
305-
"TOKEN",
306276
*anno_keys,
307277
"AUTHORS_COMMENT",
308278
"EDITORS_COMMENT",
@@ -323,7 +293,6 @@ async def _export_csv(
323293
[
324294
full_location_labels.get(str(content.location_id), ""),
325295
i,
326-
token.token,
327296
*csv_annos,
328297
content.authors_comment,
329298
content.editors_comment,
@@ -529,26 +498,13 @@ class TextAnnotationEntry(ModelBase):
529498

530499

531500
class TextAnnotationToken(ModelBase):
532-
token: Annotated[
533-
ConStr(
534-
max_length=4096,
535-
cleanup="oneline",
536-
),
537-
Field(
538-
description="Text token",
539-
),
540-
]
541501
annotations: Annotated[
542502
list[TextAnnotationEntry],
543503
Field(
544-
description="List of annotations on this token",
504+
description="List of annotations on a token",
545505
max_length=128,
546506
),
547507
] = []
548-
lb: Annotated[
549-
bool,
550-
Field(description="Whether this token ends a line"),
551-
] = False
552508

553509

554510
class TextAnnotationContent(ContentBase):
@@ -603,25 +559,6 @@ class TextAnnotationSearchQuery(ModelBase):
603559
description="Type of the resource to search in",
604560
),
605561
]
606-
token: Annotated[
607-
ConStr(
608-
min_length=0,
609-
max_length=512,
610-
cleanup="oneline",
611-
),
612-
Field(
613-
description="Token search query",
614-
),
615-
SchemaOptionalNullable,
616-
] = ""
617-
token_wildcards: Annotated[
618-
bool,
619-
Field(
620-
alias="twc",
621-
description="Whether to interpret wildcards in the token query",
622-
),
623-
SchemaOptionalNullable,
624-
] = False
625562
annotations: Annotated[
626563
list[TextAnnotationQueryEntry],
627564
Field(

0 commit comments

Comments
 (0)