Commit eb10473

Merge pull request #1 from Ben-Epstein/as_hf_dataset
support returning hf dataset
2 parents: 6ae3453 + 8dc1116

File tree: 6 files changed, +86 −13 lines

README.md

Lines changed: 13 additions & 7 deletions
@@ -6,7 +6,7 @@ on using huggingface transformers. But Spacy's Entity format is the most intuiti
 format for tagging entities for NER.
 
 This repo is a simple converter that leverages `spacy.gold.biluo_tags_from_offsets`
-and the SpaCy [`tokenizations`](https://github.com/explosion/tokenizations) repo that
+and the SpaCy [`tokenizations`](https://github.com/explosion/tokenizations) repo that
 creates a 1-line function to convert spacy
 formatted spans to `tokens` and `ner_tags` that can be fed into any
 Token Classification Transformer
@@ -26,8 +26,8 @@ For example:
 text = "Hello, my name is Ben"
 spans = [{"start": 18, "end": 21, "label": "person"}, ...]
 ```
-
-This is the common structure of output data from labeling tools like LabelStudio or LabelBox, because it's easy and human interpretable.
+
+This is the common structure of output data from labeling tools like LabelStudio or LabelBox, because it's easy and human interpretable.
 
 Huggingface format refers to the BIO/BILOU/BIOES tagging format commonly used for fine-tuning transformers. The input text is tokenized, and each token
 is given a tag to denote whether or not it's a label (and its location: Beginning, Inside, etc). Here's an example: https://huggingface.co/datasets/wikiann
@@ -37,7 +37,7 @@ For more information about this tagging system, see [wikipedia](https://en.wikip
 
 
 This format is tricky, though, because it is entirely dependent on the tokenizer used. Tokens are not simply space-separated words. Each tokenizer has a specific vocabulary of tokens that break words down into unique sub-words. So moving from character-level spans to token-level tags is a very
-manual process. That's a core reason I built this tool.
+manual process. That's a core reason I built this tool.
 
 ## Installation
 ```shell
@@ -48,7 +48,6 @@ python -m spacy download en_core_web_sm
 ## Usage
 ```python
 from spacy_to_hf import spacy_to_hf
-from datasets import Dataset
 
 span_data = [
     {
@@ -62,10 +61,17 @@ span_data = [
 ]
 hf_data = spacy_to_hf(span_data, "bert-base-cased")
 print(list(zip(hf_data["tokens"][0], hf_data["ner_tags"][0])))
-ds = Dataset.from_dict(hf_data)
 ```
 
-From here, you can label-index your ner_tags and prepare for fine-tuning
+Or, if you want to immediately start fine-tuning or upload this to huggingface, you can
+run
+```python
+ds = spacy_to_hf(span_data, "bert-base-cased", as_hf_dataset=True)
+
+print(ds.features["ner_tags"].feature.names)
+```
+This will return your data as a HuggingFace `Dataset` and will automatically
+string-index your `ner_tags` into a `ClassLabel` object
 
 ## Project Setup
 Project setup is credited to [@anthonycorletti](https://github.com/anthonycorletti) and his awesome [project template repo](https://github.com/anthonycorletti/python-project-template)
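
For context on the BILOU scheme the README describes, here is a hypothetical, self-contained sketch (not part of this commit) that tags the README's example sentence using a naive whitespace tokenizer. Because each entity here spans a single token, it only ever produces `U-` (unit) and `O` tags; it also illustrates why real subword tokenizers make span-to-tag conversion the manual process the README mentions.

```python
# Illustrative toy only: a whitespace "tokenizer" showing how character
# spans map to BILOU tags. Real subword tokenizers (e.g. BERT's WordPiece)
# split text differently, which is exactly what spacy_to_hf handles.
text = "Hello, my name is Ben"
spans = [{"start": 18, "end": 21, "label": "person"}]

tokens, tags, pos = [], [], 0
for word in text.split():
    start = text.index(word, pos)  # character offset of this token
    end = start + len(word)
    pos = end
    # Find a span overlapping this token, if any
    label = next(
        (s["label"] for s in spans if s["start"] < end and start < s["end"]), None
    )
    # Single-token entities get "U-" (unit) in BILOU; non-entities get "O"
    tags.append(f"U-{label}" if label else "O")
    tokens.append(word)

print(list(zip(tokens, tags)))
# [('Hello,', 'O'), ('my', 'O'), ('name', 'O'), ('is', 'O'), ('Ben', 'U-person')]
```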

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ dependencies = [
     "spacy <3",
     "pytokenizations",
     "transformers",
+    "datasets",
     "flax" # As the backend for transformers. Smaller/Faster than torch or tf
 ]
 [[project.authors]]

spacy_to_hf/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.0.1"
+__version__ = "0.0.2"
 
 from spacy_to_hf.conversion import spacy_to_hf
 

spacy_to_hf/conversion.py

Lines changed: 17 additions & 4 deletions
@@ -1,16 +1,19 @@
-from typing import Collection, Dict, List, Sequence
+from typing import Collection, Dict, List, Sequence, Union
 
 import spacy
+from datasets import Dataset
 from spacy.gold import biluo_tags_from_offsets
 from tokenizations import get_alignments
 from transformers import AutoTokenizer
 
-from spacy_to_hf.utils import map_spacy_to_hf_tags
+from spacy_to_hf.utils import dict_to_dataset, map_spacy_to_hf_tags
 
 
 def spacy_to_hf(
-    spacy_data: List[Dict[str, Sequence[Collection[str]]]], tokenizer: str
-) -> Dict[str, List[str]]:
+    spacy_data: List[Dict[str, Sequence[Collection[str]]]],
+    tokenizer: str,
+    as_hf_dataset: bool = False,
+) -> Union[Dataset, Dict[str, List[List[str]]]]:
     """Maps spacy formatted spans to HF tokens in BILOU format
 
     Input should be a list of dictionaries of 'text' and 'spans' keys
@@ -52,6 +55,14 @@ def spacy_to_hf(
         ('##U', 'L-university')]
 
 
+    :param spacy_data: The spacy formatted span data. Must be a list of
+        dictionaries with a "text" key and a "spans" key. "spans" must be a list
+        of dictionaries with "start", "end", and "label" keys
+    :param tokenizer: The tokenizer/model you will be training with in huggingface.
+        A good option could be "bert-base-uncased"
+    :param as_hf_dataset: Whether to return a formatted Huggingface Dataset. If
+        True, the dataset will have `tokens` and `ner_tags` as columns, and
+        `ner_tags` will be a ClassLabel
     """
     assert all(
         sorted(row.keys()) == ["spans", "text"] for row in spacy_data
@@ -76,4 +87,6 @@ def spacy_to_hf(
         hf_tags = map_spacy_to_hf_tags(hf_to_spacy, spacy_tags)
         hf_data["tokens"].append(hf_tokens)
         hf_data["ner_tags"].append(hf_tags)
+    if as_hf_dataset:
+        return dict_to_dataset(hf_data)
     return hf_data
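
To make the new signature concrete, here is a minimal usage sketch. It assumes the package and its `datasets`/`transformers` dependencies are installed; `"bert-base-cased"` is the tokenizer the repo's own tests use, and the printed label names are an expected example rather than a guaranteed output.

```python
from spacy_to_hf import spacy_to_hf

span_data = [
    {
        "text": "Hello, my name is Ben",
        "spans": [{"start": 18, "end": 21, "label": "person"}],
    }
]

# Default behavior: a plain dict of parallel token/tag lists, as before
hf_dict = spacy_to_hf(span_data, "bert-base-cased")
print(hf_dict["tokens"][0], hf_dict["ner_tags"][0])

# New in this commit: a ready-to-train Dataset with ner_tags as a ClassLabel
ds = spacy_to_hf(span_data, "bert-base-cased", as_hf_dataset=True)
print(ds.features["ner_tags"].feature.names)  # e.g. ['O', 'U-person']
```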

spacy_to_hf/utils.py

Lines changed: 34 additions & 1 deletion
@@ -1,4 +1,7 @@
-from typing import List
+from itertools import chain
+from typing import Dict, List
+
+from datasets import ClassLabel, Dataset, Sequence
 
 
 def next_token_is_same(tokens: List[List[int]], cur_idx: int, tok_num: int) -> bool:
@@ -128,3 +131,33 @@ def map_spacy_to_hf_tags(
 
     hf_tags.extend(clean_hf_tags)
     return hf_tags
+
+
+def dict_to_dataset(hf_data: Dict[str, List[List[str]]]) -> Dataset:
+    """Converts a dictionary of huggingface data into a well-formed Dataset
+
+    ex input:
+        {
+            "tokens": [["sentence", "1"], ["sentence", "Apple"]],
+            "ner_tags": [["U-word", "O"], ["U-word", "U-ORG"]]
+        }
+
+    This will create a huggingface dataset from the input, and also map the `ner_tags`
+    into a ClassLabel object, which is required for training.
+    """
+    labels = sorted(set(chain.from_iterable(hf_data["ner_tags"])))
+    # O is typically the first tag. Move it there
+    if "O" in labels:
+        labels.remove("O")
+        labels.insert(0, "O")
+    ds = Dataset.from_dict(hf_data)
+    # https://github.com/python/mypy/issues/6239
+    class_label = Sequence(feature=ClassLabel(num_classes=len(labels), names=labels))
+    # First need to string-index the ner_tags
+    label_to_idx = dict(zip(labels, range(len(labels))))
+    ds = ds.map(
+        lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]}
+    )
+    # Then we can cast to the ClassLabel
+    ds = ds.cast_column("ner_tags", class_label)
+    return ds
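
A small sketch of what `dict_to_dataset` returns for the docstring's example input (assuming `datasets` is installed); the expected outputs in the comments follow from the label-sorting logic above.

```python
from spacy_to_hf.utils import dict_to_dataset

hf_data = {
    "tokens": [["sentence", "1"], ["sentence", "Apple"]],
    "ner_tags": [["U-word", "O"], ["U-word", "U-ORG"]],
}
ds = dict_to_dataset(hf_data)

# "O" is moved to index 0; the remaining labels stay in sorted order
print(ds.features["ner_tags"].feature.names)  # ['O', 'U-ORG', 'U-word']
# Tags are now integer-indexed against the ClassLabel names
print(ds["ner_tags"])  # [[2, 0], [2, 1]]
```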

tests/test_conversion.py

Lines changed: 20 additions & 0 deletions
@@ -1,6 +1,7 @@
 from typing import Dict, List
 
 import pytest
+from datasets import Dataset
 
 from spacy_to_hf import spacy_to_hf
 from tests.constants import (
@@ -28,6 +29,25 @@ def test_spacy_to_hf(
     assert hf_data["ner_tags"][0] == hf_tags
 
 
+@pytest.mark.parametrize(
+    "spacy_data,hf_tokens,hf_tags",
+    [
+        (SPACY_DATA_1, HF_TOKENS_1, HF_TAGS_1),
+        (SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
+    ],
+)
+def test_spacy_to_hf_as_dataset(
+    spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
+) -> None:
+    hf_data = spacy_to_hf(spacy_data, "bert-base-cased", as_hf_dataset=True)
+    hf_non_o_tags = [i for i in hf_tags if i != "O"]
+    sorted_tags = ["O"] + sorted(set(hf_non_o_tags))
+    assert isinstance(hf_data, Dataset)
+    assert hf_data.features["ner_tags"].feature.names == sorted_tags
+    assert hf_data["tokens"][0] == hf_tokens
+    assert hf_data["ner_tags"][0] == [sorted_tags.index(tag) for tag in hf_tags]
+
+
 def test_spacy_to_hf_spans_not_list() -> None:
     spacy_data = [
         {
