Skip to content

Commit 0218abd

Browse files
authored
remove utils.document.deduplicate_annotations in favor of Document.deduplicate_annotations (#437)
* remove utils.document.deduplicate_annotations in favor of Document.deduplicate_annotations
* cleanup
1 parent 14f6679 commit 0218abd

File tree

2 files changed: 2 additions (+2) and 27 deletions (-27).

src/pytorch_ie/utils/document.py

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,6 @@ def deduplicate_annotation_dicts(
2929
D = TypeVar("D", bound=Document)
3030

3131

32-
def deduplicate_annotations(document: D) -> D:
33-
"""Remove duplicate annotations from a document.
34-
35-
Args:
36-
document: The document to remove duplicate annotations from.
37-
38-
Returns:
39-
The document with duplicate annotations removed.
40-
"""
41-
annotation_field_names = [field.name for field in document.annotation_fields()]
42-
doc_dict = document.asdict()
43-
for annotation_field_name in annotation_field_names:
44-
doc_dict[annotation_field_name]["annotations"] = deduplicate_annotation_dicts(
45-
doc_dict[annotation_field_name]["annotations"]
46-
)
47-
doc_dict[annotation_field_name]["predictions"] = deduplicate_annotation_dicts(
48-
doc_dict[annotation_field_name]["predictions"]
49-
)
50-
return type(document).fromdict(doc_dict)
51-
52-
5332
def save_annotation_sources_to_metadata(
5433
document: D,
5534
annotation_id2source: Dict[int, List[str]],
@@ -124,8 +103,6 @@ def merge_annotations_from_documents(
124103
f"Document IDs do not match: {document.id} and {merged_document.id}"
125104
)
126105

127-
# TODO: add_all_annotations_from_other needs to be fixed! it should return a mapping from
128-
# original annotation *IDs* to new annotations!
129106
# Note: this does not check for duplicates!
130107
added_annotations = merged_document.add_all_annotations_from_other(
131108
other=document, strict=True
@@ -135,7 +112,8 @@ def merge_annotations_from_documents(
135112
for orig_id, new_annotation in orig_id2new_annotation.items():
136113
added_annotation_id2source_names[new_annotation._id].append(source_name)
137114

138-
merged_document = deduplicate_annotations(merged_document)
115+
# this will remove duplicates. If duplicates have different scores, the one with the highest score will be kept
116+
merged_document = merged_document.deduplicate_annotations()
139117

140118
# save source names in metadata (at key metadata_key_source_annotations / metadata_key_source_predictions
141119
# for each layer in the order of the annotations / predictions)

tests/utils/test_document.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,9 @@ def test_document_merge_annotations():
3232
assert result.id == "doc1"
3333
assert set(result.labeled_spans) == set(base_doc.labeled_spans)
3434
assert len(result.labeled_spans) == len(base_doc.labeled_spans) == 2
35-
assert len(result.labeled_spans.predictions) == 4
3635
assert result.labeled_spans.predictions.resolve() == [
3736
("label1", "This"),
3837
("label2", "is"),
39-
("label1", "This"),
4038
("label3", "is"),
4139
]
4240
annotations_with_sources = [
@@ -59,6 +57,5 @@ def test_document_merge_annotations():
5957
assert predictions_with_scores == [
6058
(LabeledSpan(start=0, end=4, label="label1", score=0.9), ["doc1"]),
6159
(LabeledSpan(start=5, end=7, label="label2", score=0.7), ["doc1", "doc2"]),
62-
(LabeledSpan(start=0, end=4, label="label1", score=0.8), ["doc2"]),
6360
(LabeledSpan(start=5, end=7, label="label3", score=0.6), ["doc2"]),
6461
]

0 commit comments

Comments
 (0)