Skip to content

Commit 0acbecd

Browse files
committed
use pie-core in dataset builders
1 parent bf0dae6 commit 0acbecd

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

54 files changed

+257
-169
lines changed

dataset_builders/pie/aae2/aae2.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from typing import Dict
33

44
import pandas as pd
5+
from pie_modules.annotations import BinaryRelation
56
from pie_modules.document.processing import RegexPartitioner
6-
from pytorch_ie.annotations import BinaryRelation
7-
from pytorch_ie.documents import (
7+
from pie_modules.documents import (
88
TextDocumentWithLabeledSpansAndBinaryRelations,
99
TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
1010
)
+3-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
pie-datasets>=0.8.0,<0.11.0
2-
pie-modules>=0.8.3,<0.16.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/abstrct/abstrct.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
1+
from pie_modules.documents import TextDocumentWithLabeledSpansAndBinaryRelations
22

33
from pie_datasets.builders import BratBuilder, BratConfig
44
from pie_datasets.builders.brat import BratDocumentWithMergedSpans
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.4.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/argmicro/argmicro.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
from typing import Any, Dict, List, Optional, Set, Tuple
77

88
import datasets
9-
from pytorch_ie.annotations import BinaryRelation, Label, LabeledSpan, Span
10-
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
11-
from pytorch_ie.documents import (
9+
from pie_core import Annotation, AnnotationLayer, annotation_field
10+
from pie_modules.annotations import BinaryRelation, Label, LabeledSpan, Span
11+
from pie_modules.documents import (
1212
TextBasedDocument,
1313
TextDocumentWithLabeledSpansAndBinaryRelations,
1414
)
@@ -42,10 +42,10 @@ class MultiRelation(Annotation):
4242
@dataclasses.dataclass
4343
class ArgMicroDocument(TextBasedDocument):
4444
topic_id: Optional[str] = None
45-
stance: AnnotationList[Label] = annotation_field()
46-
edus: AnnotationList[Span] = annotation_field(target="text")
47-
adus: AnnotationList[LabeledAnnotationCollection] = annotation_field(target="edus")
48-
relations: AnnotationList[MultiRelation] = annotation_field(target="adus")
45+
stance: AnnotationLayer[Label] = annotation_field()
46+
edus: AnnotationLayer[Span] = annotation_field(target="text")
47+
adus: AnnotationLayer[LabeledAnnotationCollection] = annotation_field(target="edus")
48+
relations: AnnotationLayer[MultiRelation] = annotation_field(target="adus")
4949

5050

5151
def example_to_document(
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.3.3,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/biorel/biorel.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from typing import Any
44

55
import datasets
6-
from pytorch_ie import AnnotationLayer, annotation_field
7-
from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
8-
from pytorch_ie.documents import (
6+
from pie_core import AnnotationLayer, annotation_field
7+
from pie_modules.annotations import BinaryRelation, LabeledSpan, Span
8+
from pie_modules.documents import (
99
TextBasedDocument,
1010
TextDocumentWithLabeledSpansAndBinaryRelations,
1111
)
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.6.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core
+3-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.4.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/cdcp/cdcp.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
from typing import Any, Dict, List, Optional
44

55
import datasets
6+
from pie_core import Annotation, AnnotationLayer, annotation_field
7+
from pie_modules.annotations import BinaryRelation, LabeledSpan
68
from pie_modules.document.processing.text_span_trimmer import trim_text_spans
7-
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
8-
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
9-
from pytorch_ie.documents import (
9+
from pie_modules.documents import (
1010
TextBasedDocument,
1111
TextDocumentWithLabeledSpansAndBinaryRelations,
1212
)
@@ -32,9 +32,9 @@ class Attribute(Annotation):
3232

3333
@dataclasses.dataclass
3434
class CDCPDocument(TextBasedDocument):
35-
propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
36-
relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
37-
urls: AnnotationList[Attribute] = annotation_field(target="propositions")
35+
propositions: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
36+
relations: AnnotationLayer[BinaryRelation] = annotation_field(target="propositions")
37+
urls: AnnotationLayer[Attribute] = annotation_field(target="propositions")
3838

3939

4040
def example_to_document(
+3-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
pie-datasets>=0.6.0,<0.11.0
2-
pie-modules>=0.8.0,<0.16.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/chemprot/chemprot.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from typing import Any, Dict
33

44
import datasets
5-
from pytorch_ie import Document
6-
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
7-
from pytorch_ie.documents import (
5+
from pie_core import Document
6+
from pie_modules.annotations import BinaryRelation, LabeledSpan
7+
from pie_modules.documents import (
88
AnnotationLayer,
99
TextBasedDocument,
1010
TextDocumentWithLabeledSpansAndBinaryRelations,
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.6.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/comagc/comagc.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from typing import Any, Dict, Optional
44

55
import datasets
6-
from pytorch_ie import AnnotationLayer, Document, annotation_field
7-
from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
8-
from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
6+
from pie_core import AnnotationLayer, Document, annotation_field
7+
from pie_modules.annotations import BinaryRelation, LabeledSpan, Span
8+
from pie_modules.documents import TextDocumentWithLabeledSpansAndBinaryRelations
99

1010
from pie_datasets import ArrowBasedBuilder
1111

Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.6.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/conll2003/conll2003.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
from typing import List, Sequence, Tuple
33

44
import datasets
5+
from pie_core import AnnotationLayer, annotation_field
6+
from pie_modules.annotations import LabeledSpan
7+
from pie_modules.documents import TextBasedDocument, TextDocumentWithLabeledSpans
58
from pie_modules.utils.sequence_tagging import tag_sequence_to_token_spans
6-
from pytorch_ie.annotations import LabeledSpan
7-
from pytorch_ie.core import AnnotationList, annotation_field
8-
from pytorch_ie.documents import TextDocument, TextDocumentWithLabeledSpans
99

1010
from pie_datasets import GeneratorBasedBuilder
1111

@@ -33,8 +33,8 @@ def tokens_and_tags_to_text_and_labeled_spans(
3333

3434

3535
@dataclass
36-
class CoNLL2003Document(TextDocument):
37-
entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
36+
class CoNLL2003Document(TextBasedDocument):
37+
entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
3838

3939

4040
class Conll2003(GeneratorBasedBuilder):
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
pie-datasets>=0.8.1,<0.11.0
2-
pie-modules>=0.15.4,<0.16.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.15.4,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/conll2012_ontonotesv5/conll2012_ontonotesv5.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from typing import Any, Dict, List, Optional, Tuple
44

55
import datasets
6-
from pytorch_ie.annotations import LabeledSpan, NaryRelation, Span
7-
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
8-
from pytorch_ie.documents import (
6+
from pie_core import Annotation, AnnotationLayer, annotation_field
7+
from pie_modules.annotations import LabeledSpan, NaryRelation, Span
8+
from pie_modules.documents import (
99
TextDocumentWithLabeledSpansAndLabeledPartitions,
1010
TokenBasedDocument,
1111
)
@@ -43,17 +43,17 @@ class Predicate(Span):
4343
@dataclasses.dataclass
4444
class Conll2012OntonotesV5Document(TokenBasedDocument):
4545
pos_tags: Optional[List[str]] = None
46-
sentences: AnnotationList[Span] = annotation_field(target="tokens")
47-
parse_trees: AnnotationList[Attribute] = annotation_field(target="sentences")
48-
speakers: AnnotationList[Attribute] = annotation_field(target="sentences")
49-
parts: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
50-
coref_mentions: AnnotationList[Span] = annotation_field(target="tokens")
51-
coref_clusters: AnnotationList[SpanSet] = annotation_field(target="coref_mentions")
52-
srl_arguments: AnnotationList[Span] = annotation_field(target="tokens")
53-
srl_relations: AnnotationList[NaryRelation] = annotation_field(target="srl_arguments")
54-
entities: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
55-
predicates: AnnotationList[Predicate] = annotation_field(target="tokens")
56-
word_senses: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
46+
sentences: AnnotationLayer[Span] = annotation_field(target="tokens")
47+
parse_trees: AnnotationLayer[Attribute] = annotation_field(target="sentences")
48+
speakers: AnnotationLayer[Attribute] = annotation_field(target="sentences")
49+
parts: AnnotationLayer[LabeledSpan] = annotation_field(target="tokens")
50+
coref_mentions: AnnotationLayer[Span] = annotation_field(target="tokens")
51+
coref_clusters: AnnotationLayer[SpanSet] = annotation_field(target="coref_mentions")
52+
srl_arguments: AnnotationLayer[Span] = annotation_field(target="tokens")
53+
srl_relations: AnnotationLayer[NaryRelation] = annotation_field(target="srl_arguments")
54+
entities: AnnotationLayer[LabeledSpan] = annotation_field(target="tokens")
55+
predicates: AnnotationLayer[Predicate] = annotation_field(target="tokens")
56+
word_senses: AnnotationLayer[LabeledSpan] = annotation_field(target="tokens")
5757

5858

5959
def bio2spans(bio: List[str], offset: int = 0) -> List[LabeledSpan]:
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.3.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/drugprot/drugprot.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from typing import Any, Dict, Optional, Union
33

44
import datasets
5-
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
6-
from pytorch_ie.documents import (
5+
from pie_modules.annotations import BinaryRelation, LabeledSpan
6+
from pie_modules.documents import (
77
AnnotationLayer,
88
TextBasedDocument,
99
TextDocumentWithLabeledSpansAndBinaryRelations,
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.9.0,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

dataset_builders/pie/imdb/imdb.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from typing import Any, Dict
33

44
import datasets
5-
from pytorch_ie.annotations import Label
6-
from pytorch_ie.documents import TextDocumentWithLabel
5+
from pie_modules.annotations import Label
6+
from pie_modules.documents import TextDocumentWithLabel
77

88
from pie_datasets import GeneratorBasedBuilder
99

+3-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
pie-datasets>=0.8.1,<0.11.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
pie-datasets>=0.6.0,<0.11.0
2-
pie-modules>=0.10.8,<0.16.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.13.5,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core
34
networkx>=3.0.0,<4.0.0

dataset_builders/pie/sciarg/sciarg.py

+66-8
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
1+
import dataclasses
12
import logging
23
from typing import Union
34

5+
from pie_core import AnnotationLayer, Document, annotation_field
46
from pie_modules.document.processing import (
57
RegexPartitioner,
68
RelationArgumentSorter,
79
SpansViaRelationMerger,
810
TextSpanTrimmer,
911
)
10-
from pytorch_ie.core import Document
11-
from pytorch_ie.documents import (
12+
from pie_modules.documents import (
1213
TextDocumentWithLabeledMultiSpansAndBinaryRelations,
1314
TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
1415
TextDocumentWithLabeledSpansAndBinaryRelations,
1516
TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
1617
)
1718

1819
from pie_datasets.builders import BratBuilder, BratConfig
19-
from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans
20+
from pie_datasets.builders.brat import (
21+
BratAttribute,
22+
BratDocument,
23+
BratDocumentWithMergedSpans,
24+
BratNote,
25+
)
2026
from pie_datasets.core.dataset import DocumentConvertersType
2127
from pie_datasets.document.processing import Caster, Pipeline
2228

@@ -26,6 +32,35 @@
2632
SPLIT_PATHS = {"train": "compiled_corpus"}
2733

2834

35+
@dataclasses.dataclass
36+
class ConvertedBratDocument(TextDocumentWithLabeledMultiSpansAndBinaryRelations):
37+
span_attributes: AnnotationLayer[BratAttribute] = annotation_field(
38+
target="labeled_multi_spans"
39+
)
40+
relation_attributes: AnnotationLayer[BratAttribute] = annotation_field(
41+
target="binary_relations"
42+
)
43+
notes: AnnotationLayer[BratNote] = annotation_field(
44+
targets=[
45+
"labeled_multi_spans",
46+
"binary_relations",
47+
"span_attributes",
48+
"relation_attributes",
49+
]
50+
)
51+
52+
53+
@dataclasses.dataclass
54+
class ConvertedBratDocumentWithMergedSpans(TextDocumentWithLabeledSpansAndBinaryRelations):
55+
span_attributes: AnnotationLayer[BratAttribute] = annotation_field(target="labeled_spans")
56+
relation_attributes: AnnotationLayer[BratAttribute] = annotation_field(
57+
target="binary_relations"
58+
)
59+
notes: AnnotationLayer[BratNote] = annotation_field(
60+
targets=["labeled_spans", "binary_relations", "span_attributes", "relation_attributes"]
61+
)
62+
63+
2964
def get_common_converter_pipeline_steps(target_document_type: type[Document]) -> dict:
3065
return dict(
3166
cast=Caster(
@@ -106,13 +141,36 @@ class SciArg(BratBuilder):
106141
def _generate_document(self, example, **kwargs):
107142
document = super()._generate_document(example, **kwargs)
108143
if self.config.resolve_parts_of_same:
109-
document = SpansViaRelationMerger(
110-
relation_layer="relations",
144+
# we need to convert the document to a different type to be able to merge the spans:
145+
# SpansViaRelationMerger expects the spans to be of type LabeledSpan,
146+
# but the document has spans of type BratSpan
147+
converted_doc = document.as_type(
148+
ConvertedBratDocumentWithMergedSpans,
149+
field_mapping={
150+
"spans": "labeled_spans",
151+
"relations": "binary_relations",
152+
},
153+
keep_remaining=True,
154+
)
155+
merged_document = SpansViaRelationMerger(
156+
relation_layer="binary_relations",
111157
link_relation_label="parts_of_same",
112158
create_multi_spans=True,
113-
result_document_type=BratDocument,
114-
result_field_mapping={"spans": "spans", "relations": "relations"},
115-
)(document)
159+
result_document_type=ConvertedBratDocument,
160+
result_field_mapping={
161+
"labeled_spans": "labeled_multi_spans",
162+
"binary_relations": "binary_relations",
163+
"span_attributes": "span_attributes",
164+
"relation_attributes": "relation_attributes",
165+
"notes": "notes",
166+
},
167+
)(converted_doc)
168+
# convert back to BratDocument
169+
document = merged_document.as_type(
170+
BratDocument,
171+
field_mapping={"labeled_multi_spans": "spans", "binary_relations": "relations"},
172+
keep_remaining=True,
173+
)
116174
else:
117175
# some documents have duplicate relations, remove them
118176
remove_duplicate_relations(document)
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
pie-datasets>=0.6.0,<0.11.0
2-
pie-modules>=0.15.4,<0.16.0
1+
pie-datasets @ git+https://github.com/ArneBinder/pie-datasets@use_pie_core
2+
#pie-modules>=0.15.4,<0.16.0
3+
pie-modules @ git+https://github.com/ArneBinder/pie-modules@use_pie_core

0 commit comments

Comments
 (0)