|
| 1 | +import dataclasses |
1 | 2 | import logging
|
2 | 3 | from typing import Union
|
3 | 4 |
|
| 5 | +from pie_core import AnnotationLayer, Document, annotation_field |
4 | 6 | from pie_modules.document.processing import (
|
5 | 7 | RegexPartitioner,
|
6 | 8 | RelationArgumentSorter,
|
7 | 9 | SpansViaRelationMerger,
|
8 | 10 | TextSpanTrimmer,
|
9 | 11 | )
|
10 |
| -from pytorch_ie.core import Document |
11 |
| -from pytorch_ie.documents import ( |
| 12 | +from pie_modules.documents import ( |
12 | 13 | TextDocumentWithLabeledMultiSpansAndBinaryRelations,
|
13 | 14 | TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
|
14 | 15 | TextDocumentWithLabeledSpansAndBinaryRelations,
|
15 | 16 | TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
|
16 | 17 | )
|
17 | 18 |
|
18 | 19 | from pie_datasets.builders import BratBuilder, BratConfig
|
19 |
| -from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans |
| 20 | +from pie_datasets.builders.brat import ( |
| 21 | + BratAttribute, |
| 22 | + BratDocument, |
| 23 | + BratDocumentWithMergedSpans, |
| 24 | + BratNote, |
| 25 | +) |
20 | 26 | from pie_datasets.core.dataset import DocumentConvertersType
|
21 | 27 | from pie_datasets.document.processing import Caster, Pipeline
|
22 | 28 |
|
|
26 | 32 | SPLIT_PATHS = {"train": "compiled_corpus"}
|
27 | 33 |
|
28 | 34 |
|
@dataclasses.dataclass
class ConvertedBratDocument(TextDocumentWithLabeledMultiSpansAndBinaryRelations):
    """Multi-span text document that additionally carries Brat attributes and notes.

    Extends ``TextDocumentWithLabeledMultiSpansAndBinaryRelations`` with three
    extra annotation layers so that Brat-specific annotations can be kept
    alongside the ``labeled_multi_spans`` and ``binary_relations`` layers.
    """

    # Brat attributes attached to entries of the ``labeled_multi_spans`` layer.
    span_attributes: AnnotationLayer[BratAttribute] = annotation_field(target="labeled_multi_spans")
    # Brat attributes attached to entries of the ``binary_relations`` layer.
    relation_attributes: AnnotationLayer[BratAttribute] = annotation_field(target="binary_relations")
    # Brat notes; these may refer to spans, relations, or either attribute layer.
    notes: AnnotationLayer[BratNote] = annotation_field(
        targets=["labeled_multi_spans", "binary_relations", "span_attributes", "relation_attributes"]
    )
| 51 | + |
| 52 | + |
@dataclasses.dataclass
class ConvertedBratDocumentWithMergedSpans(TextDocumentWithLabeledSpansAndBinaryRelations):
    """Single-span text document that additionally carries Brat attributes and notes.

    Extends ``TextDocumentWithLabeledSpansAndBinaryRelations`` with three extra
    annotation layers so that Brat-specific annotations can be kept alongside
    the ``labeled_spans`` and ``binary_relations`` layers.
    """

    # Brat attributes attached to entries of the ``labeled_spans`` layer.
    span_attributes: AnnotationLayer[BratAttribute] = annotation_field(
        target="labeled_spans",
    )
    # Brat attributes attached to entries of the ``binary_relations`` layer.
    relation_attributes: AnnotationLayer[BratAttribute] = annotation_field(
        target="binary_relations",
    )
    # Brat notes; these may refer to spans, relations, or either attribute layer.
    notes: AnnotationLayer[BratNote] = annotation_field(
        targets=[
            "labeled_spans",
            "binary_relations",
            "span_attributes",
            "relation_attributes",
        ],
    )
| 62 | + |
| 63 | + |
29 | 64 | def get_common_converter_pipeline_steps(target_document_type: type[Document]) -> dict:
|
30 | 65 | return dict(
|
31 | 66 | cast=Caster(
|
@@ -106,13 +141,36 @@ class SciArg(BratBuilder):
|
106 | 141 | def _generate_document(self, example, **kwargs):
|
107 | 142 | document = super()._generate_document(example, **kwargs)
|
108 | 143 | if self.config.resolve_parts_of_same:
|
109 |
| - document = SpansViaRelationMerger( |
110 |
| - relation_layer="relations", |
| 144 | + # we need to convert the document to a different type to be able to merge the spans: |
| 145 | + # SpansViaRelationMerger expects the spans to be of type LabeledSpan, |
| 146 | + # but the document has spans of type BratSpan |
| 147 | + converted_doc = document.as_type( |
| 148 | + ConvertedBratDocumentWithMergedSpans, |
| 149 | + field_mapping={ |
| 150 | + "spans": "labeled_spans", |
| 151 | + "relations": "binary_relations", |
| 152 | + }, |
| 153 | + keep_remaining=True, |
| 154 | + ) |
| 155 | + merged_document = SpansViaRelationMerger( |
| 156 | + relation_layer="binary_relations", |
111 | 157 | link_relation_label="parts_of_same",
|
112 | 158 | create_multi_spans=True,
|
113 |
| - result_document_type=BratDocument, |
114 |
| - result_field_mapping={"spans": "spans", "relations": "relations"}, |
115 |
| - )(document) |
| 159 | + result_document_type=ConvertedBratDocument, |
| 160 | + result_field_mapping={ |
| 161 | + "labeled_spans": "labeled_multi_spans", |
| 162 | + "binary_relations": "binary_relations", |
| 163 | + "span_attributes": "span_attributes", |
| 164 | + "relation_attributes": "relation_attributes", |
| 165 | + "notes": "notes", |
| 166 | + }, |
| 167 | + )(converted_doc) |
| 168 | + # convert back to BratDocument |
| 169 | + document = merged_document.as_type( |
| 170 | + BratDocument, |
| 171 | + field_mapping={"labeled_multi_spans": "spans", "binary_relations": "relations"}, |
| 172 | + keep_remaining=True, |
| 173 | + ) |
116 | 174 | else:
|
117 | 175 | # some documents have duplicate relations, remove them
|
118 | 176 | remove_duplicate_relations(document)
|
|
0 commit comments