Commit eb10473

Merge pull request #1 from Ben-Epstein/as_hf_dataset
support returning hf dataset
2 parents: 6ae3453 + 8dc1116

File tree: 6 files changed, +86 −13 lines

README.md

Lines changed: 13 additions & 7 deletions
@@ -6,7 +6,7 @@ on using huggingface transformers. But Spacy's Entity format is the most intuiti
 format for tagging entities for NER.
 
 This repo is a simple converter that leverages `spacy.gold.biluo_tags_from_offsets`
-and the SpaCy [`tokenizations`](https://github.com/explosion/tokenizations) repo that
+and the SpaCy [`tokenizations`](https://github.com/explosion/tokenizations) repo that
 creates a 1-line function to convert spacy
 formatted spans to `tokens` and `ner_tags` that can be fed into any
 Token Classification Transformer
@@ -26,8 +26,8 @@ For example:
 text = "Hello, my name is Ben"
 spans = [{"start": 18, "end": 21, "label": "person"}, ...]
 ```
-
-This is the common structure of output data from labeling tools like LabelStudio or LabelBox, because it's easy and human interpretable.
+
+This is the common structure of output data from labeling tools like LabelStudio or LabelBox, because it's easy and human interpretable.
 
 Huggingface format refers to the BIO/BILOU/BIOES tagging format commonly used for fine-tuning transformers. The input text is tokenized, and each token
 is given a tag to denote whether or not it's a label (and its location: Beginning, Inside, etc). Here's an example: https://huggingface.co/datasets/wikiann
@@ -37,7 +37,7 @@ For more information about this tagging system, see [wikipedia](https://en.wikip
 
 
 This format is tricky, though, because it is entirely dependent on the tokenizer used. Tokens are not simply space-separated words. Each tokenizer has a specific vocabulary of tokens that break words down into unique sub-words. So moving from character-level spans to token-level tags is a very
-manual process. That's a core reason I built this tool.
+manual process. That's a core reason I built this tool.
 
 ## Installation
 ```shell
@@ -48,7 +48,6 @@ python -m spacy download en_core_web_sm
 ## Usage
 ```python
 from spacy_to_hf import spacy_to_hf
-from datasets import Dataset
 
 span_data = [
     {
@@ -62,10 +61,17 @@ span_data = [
 ]
 hf_data = spacy_to_hf(span_data, "bert-base-cased")
 print(list(zip(hf_data["tokens"][0], hf_data["ner_tags"][0])))
-ds = Dataset.from_dict(hf_data)
 ```
 
-From here, you can label-index your ner_tags and prepare for fine-tuning
+Or, if you want to immediately start fine-tuning or upload this to huggingface, you can
+run
+```python
+ds = spacy_to_hf(span_data, "bert-base-cased", as_hf_dataset=True)
+
+print(ds.features["ner_tags"].feature.names)
+```
+This will return your data as a HuggingFace `Dataset` and will automatically
+string-index your `ner_tags` into a `ClassLabel` object
 
 ## Project Setup
 Project setup is credited to [@anthonycorletti](https://github.com/anthonycorletti) and his awesome [project template repo](https://github.com/anthonycorletti/python-project-template)
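
For context on the BILOU scheme the README describes, here is a hypothetical, self-contained sketch (not part of this commit) that tags the README's example sentence using a naive whitespace tokenizer. Because each entity here spans a single token, it only ever produces `U-` (unit) and `O` tags; it also illustrates why real subword tokenizers make span-to-tag conversion the manual process the README mentions.

```python
# Illustrative toy only: a whitespace "tokenizer" showing how character
# spans map to BILOU tags. Real subword tokenizers (e.g. BERT's WordPiece)
# split text differently, which is exactly what spacy_to_hf handles.
text = "Hello, my name is Ben"
spans = [{"start": 18, "end": 21, "label": "person"}]

tokens, tags, pos = [], [], 0
for word in text.split():
    start = text.index(word, pos)  # character offset of this token
    end = start + len(word)
    pos = end
    # Find a span overlapping this token, if any
    label = next(
        (s["label"] for s in spans if s["start"] < end and start < s["end"]), None
    )
    # Single-token entities get "U-" (unit) in BILOU; non-entities get "O"
    tags.append(f"U-{label}" if label else "O")
    tokens.append(word)

print(list(zip(tokens, tags)))
# [('Hello,', 'O'), ('my', 'O'), ('name', 'O'), ('is', 'O'), ('Ben', 'U-person')]
```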

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ dependencies = [
     "spacy <3",
     "pytokenizations",
     "transformers",
+    "datasets",
     "flax" # As the backend for transformers. Smaller/Faster than torch or tf
 ]
 [[project.authors]]

spacy_to_hf/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.0.1"
+__version__ = "0.0.2"
 
 from spacy_to_hf.conversion import spacy_to_hf
 

spacy_to_hf/conversion.py

Lines changed: 17 additions & 4 deletions
@@ -1,16 +1,19 @@
-from typing import Collection, Dict, List, Sequence
+from typing import Collection, Dict, List, Sequence, Union
 
 import spacy
+from datasets import Dataset
 from spacy.gold import biluo_tags_from_offsets
 from tokenizations import get_alignments
 from transformers import AutoTokenizer
 
-from spacy_to_hf.utils import map_spacy_to_hf_tags
+from spacy_to_hf.utils import dict_to_dataset, map_spacy_to_hf_tags
 
 
 def spacy_to_hf(
-    spacy_data: List[Dict[str, Sequence[Collection[str]]]], tokenizer: str
-) -> Dict[str, List[str]]:
+    spacy_data: List[Dict[str, Sequence[Collection[str]]]],
+    tokenizer: str,
+    as_hf_dataset: bool = False,
+) -> Union[Dataset, Dict[str, List[List[str]]]]:
     """Maps spacy formatted spans to HF tokens in BILOU format
 
     Input should be a list of dictionaries of 'text' and 'spans' keys
@@ -52,6 +55,14 @@ def spacy_to_hf(
         ('##U', 'L-university')]
 
 
+    :param spacy_data: The spacy formatted span data. Must be a list of
+        dictionaries with a "text" key and a "spans" key. "spans" must be a list
+        of dictionaries with "start", "end", and "label" keys
+    :param tokenizer: The tokenizer/model you will be training with in huggingface.
+        A good option could be "bert-base-uncased"
+    :param as_hf_dataset: Whether to return a formatted Huggingface Dataset. If
+        True, the dataset will have `tokens` and `ner_tags` as columns, and
+        `ner_tags` will be a ClassLabel
     """
     assert all(
         sorted(row.keys()) == ["spans", "text"] for row in spacy_data
@@ -76,4 +87,6 @@ def spacy_to_hf(
         hf_tags = map_spacy_to_hf_tags(hf_to_spacy, spacy_tags)
         hf_data["tokens"].append(hf_tokens)
         hf_data["ner_tags"].append(hf_tags)
+    if as_hf_dataset:
+        return dict_to_dataset(hf_data)
     return hf_data
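
To make the new signature concrete, here is a minimal usage sketch. It assumes the package and its `datasets`/`transformers` dependencies are installed; `"bert-base-cased"` is the tokenizer the repo's own tests use, and the printed label names are an expected example rather than a guaranteed output.

```python
from spacy_to_hf import spacy_to_hf

span_data = [
    {
        "text": "Hello, my name is Ben",
        "spans": [{"start": 18, "end": 21, "label": "person"}],
    }
]

# Default behavior: a plain dict of parallel token/tag lists, as before
hf_dict = spacy_to_hf(span_data, "bert-base-cased")
print(hf_dict["tokens"][0], hf_dict["ner_tags"][0])

# New in this commit: a ready-to-train Dataset with ner_tags as a ClassLabel
ds = spacy_to_hf(span_data, "bert-base-cased", as_hf_dataset=True)
print(ds.features["ner_tags"].feature.names)  # e.g. ['O', 'U-person']
```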

spacy_to_hf/utils.py

Lines changed: 34 additions & 1 deletion
@@ -1,4 +1,7 @@
-from typing import List
+from itertools import chain
+from typing import Dict, List
+
+from datasets import ClassLabel, Dataset, Sequence
 
 
 def next_token_is_same(tokens: List[List[int]], cur_idx: int, tok_num: int) -> bool:
@@ -128,3 +131,33 @@ def map_spacy_to_hf_tags(
 
     hf_tags.extend(clean_hf_tags)
     return hf_tags
+
+
+def dict_to_dataset(hf_data: Dict[str, List[List[str]]]) -> Dataset:
+    """Converts a dictionary of huggingface data into a well-formed Dataset
+
+    ex input:
+        {
+            "tokens": [["sentence", "1"], ["sentence", "Apple"]],
+            "ner_tags": [["U-word", "O"], ["U-word", "U-ORG"]]
+        }
+
+    This will create a huggingface dataset from the input, and also map the `ner_tags`
+    into a ClassLabel object, which is required for training.
+    """
+    labels = sorted(set(chain.from_iterable(hf_data["ner_tags"])))
+    # O is typically the first tag. Move it there
+    if "O" in labels:
+        labels.remove("O")
+        labels.insert(0, "O")
+    ds = Dataset.from_dict(hf_data)
+    # https://github.com/python/mypy/issues/6239
+    class_label = Sequence(feature=ClassLabel(num_classes=len(labels), names=labels))
+    # First need to string-index the ner_tags
+    label_to_idx = dict(zip(labels, range(len(labels))))
+    ds = ds.map(
+        lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]}
+    )
+    # Then we can cast to the ClassLabel
+    ds = ds.cast_column("ner_tags", class_label)
+    return ds
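
A small sketch of what `dict_to_dataset` returns for the docstring's example input (assuming `datasets` is installed); the expected outputs in the comments follow from the label-sorting logic above.

```python
from spacy_to_hf.utils import dict_to_dataset

hf_data = {
    "tokens": [["sentence", "1"], ["sentence", "Apple"]],
    "ner_tags": [["U-word", "O"], ["U-word", "U-ORG"]],
}
ds = dict_to_dataset(hf_data)

# "O" is moved to index 0; the remaining labels stay in sorted order
print(ds.features["ner_tags"].feature.names)  # ['O', 'U-ORG', 'U-word']
# Tags are now integer-indexed against the ClassLabel names
print(ds["ner_tags"])  # [[2, 0], [2, 1]]
```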

tests/test_conversion.py

Lines changed: 20 additions & 0 deletions
@@ -1,6 +1,7 @@
 from typing import Dict, List
 
 import pytest
+from datasets import Dataset
 
 from spacy_to_hf import spacy_to_hf
 from tests.constants import (
@@ -28,6 +29,25 @@ def test_spacy_to_hf(
     assert hf_data["ner_tags"][0] == hf_tags
 
 
+@pytest.mark.parametrize(
+    "spacy_data,hf_tokens,hf_tags",
+    [
+        (SPACY_DATA_1, HF_TOKENS_1, HF_TAGS_1),
+        (SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
+    ],
+)
+def test_spacy_to_hf_as_dataset(
+    spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
+) -> None:
+    hf_data = spacy_to_hf(spacy_data, "bert-base-cased", as_hf_dataset=True)
+    hf_non_o_tags = [i for i in hf_tags if i != "O"]
+    sorted_tags = ["O"] + sorted(set(hf_non_o_tags))
+    assert isinstance(hf_data, Dataset)
+    assert hf_data.features["ner_tags"].feature.names == sorted_tags
+    assert hf_data["tokens"][0] == hf_tokens
+    assert hf_data["ner_tags"][0] == [sorted_tags.index(tag) for tag in hf_tags]
+
+
 def test_spacy_to_hf_spans_not_list() -> None:
     spacy_data = [
         {
