Skip to content

Commit 350ac84

Browse files
authored
Merge pull request #5 from Ben-Epstein/update-repo-312-release
update for release
2 parents 8aee01a + 042ff3d commit 350ac84

File tree

8 files changed

+25
-58
lines changed

8 files changed

+25
-58
lines changed

.github/workflows/test.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ jobs:
1212
runs-on: ubuntu-latest
1313
strategy:
1414
matrix:
15-
python-version: ["3.10"]
15+
python-version: ["3.10", "3.11", "3.12"]
1616
fail-fast: false
1717
steps:
1818
- name: checkout
19-
uses: actions/checkout@v2
19+
uses: actions/checkout@v4
2020
with:
2121
fetch-depth: 0
2222

.pre-commit-config.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# See https://pre-commit.com/hooks.html for more hooks
33
repos:
44
- repo: https://github.com/pre-commit/pre-commit-hooks
5-
rev: v4.4.0
5+
rev: v4.6.0
66
hooks:
77
- id: check-added-large-files
88
- id: check-toml
@@ -11,12 +11,8 @@ repos:
1111
- --unsafe
1212
- id: end-of-file-fixer
1313
- id: trailing-whitespace
14-
- repo: https://github.com/psf/black
15-
rev: 23.3.0
16-
hooks:
17-
- id: black
1814
- repo: https://github.com/charliermarsh/ruff-pre-commit
19-
rev: v0.0.265
15+
rev: v0.6.8
2016
hooks:
2117
- id: ruff
2218

pyproject.toml

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamic = ["version"]
1111
readme = "README.md"
1212
license = {text = 'Apache 2.0'}
1313
description = "Spacy to HF converter"
14-
requires-python = ">=3.7, <3.12"
14+
requires-python = ">=3.7, <3.13"
1515
dependencies = [
1616
"spacy-alignments",
1717
"spacy < 4",
@@ -25,7 +25,6 @@ email = "[email protected]"
2525

2626
[project.optional-dependencies]
2727
dev = [
28-
"black >=21.10b0",
2928
"coverage >=6.1.1",
3029
"invoke >=2.0.0",
3130
"mypy >=0.910",
@@ -44,11 +43,13 @@ Documentation = "https://github.com/ben-epstein/spacy-to-hf"
4443
version = {attr = "spacy_to_hf.__version__"}
4544

4645
[tool.ruff]
47-
line-length = 88
48-
ignore = ["D10"]
46+
line-length = 120
4947
include = ["*.py"]
50-
select = ["E", "F", "I"]
5148
target-version = "py310"
49+
50+
[tool.ruff.lint]
51+
select = ["E", "F", "I"]
52+
ignore = ["D10"]
5253
extend-ignore = [
5354
"D203",
5455
"D204",
@@ -64,25 +65,10 @@ extend-ignore = [
6465
"D415",
6566
]
6667

67-
[tool.ruff.pydocstyle]
68+
69+
[tool.ruff.lint.pydocstyle]
6870
convention = "google"
6971

70-
[tool.black]
71-
target-version = ['py310']
72-
include = '\.pyi?$'
73-
exclude = '''
74-
/(
75-
\.eggs
76-
| \.git
77-
| \.hg
78-
| \.*_cache
79-
| \.tox
80-
| \.venv
81-
| build
82-
| dist
83-
| __pycache__
84-
)/
85-
'''
8672

8773
[tool.mypy]
8874
ignore_missing_imports = true

spacy_to_hf/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.0.3"
1+
__version__ = "0.0.4"
22

33
from spacy_to_hf.conversion import spacy_to_hf
44

spacy_to_hf/conversion.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,7 @@ def spacy_to_hf(
8282
spans = row["spans"]
8383
assert isinstance(spans, list), "Spans must be a list"
8484
assert all(
85-
isinstance(span, dict) and sorted(span.keys()) == ["end", "label", "start"]
86-
for span in spans
85+
isinstance(span, dict) and sorted(span.keys()) == ["end", "label", "start"] for span in spans
8786
), "All spans must have keys 'start', 'end', and 'label'"
8887
text = row["text"]
8988
doc = nlp(text) # type: ignore

spacy_to_hf/utils.py

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ def _get_label(tag: str) -> str:
3434
return tag.split("-")[1]
3535

3636

37-
def _handle_unit_tag(
38-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
39-
) -> str:
37+
def _handle_unit_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
4038
"""Process a Unit tag
4139
4240
If a Unit tagged token is broken into multiple sub-tokens, we want the first
@@ -54,9 +52,7 @@ def _handle_unit_tag(
5452
return clean_tag
5553

5654

57-
def _handle_begin_tag(
58-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
59-
) -> str:
55+
def _handle_begin_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
6056
"""Process a Begin tag
6157
6258
For Begin tagged tokens that are broken into sub-tokens, we know that there will be
@@ -69,9 +65,7 @@ def _handle_begin_tag(
6965
return clean_tag
7066

7167

72-
def _handle_last_tag(
73-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
74-
) -> str:
68+
def _handle_last_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
7569
"""Process a Last tag
7670
7771
For Last tagged tokens that are broken into sub-tokens, we know that there will be
@@ -84,9 +78,7 @@ def _handle_last_tag(
8478
return clean_tag
8579

8680

87-
def map_spacy_to_hf_tags(
88-
hf_to_spacy: List[List[int]], spacy_tags: List[str]
89-
) -> List[str]:
81+
def map_spacy_to_hf_tags(hf_to_spacy: List[List[int]], spacy_tags: List[str]) -> List[str]:
9082
"""Maps the spacy_tags to the required huggingface tags
9183
9284
Leverages the hf_to_spacy map, showing how each huggingface token maps
@@ -155,9 +147,7 @@ def dict_to_dataset(hf_data: Dict[str, List[str]]) -> Dataset:
155147
class_label = Sequence(feature=ClassLabel(num_classes=len(labels), names=labels))
156148
# First need to string index the ner_tags
157149
label_to_idx = dict(zip(labels, range(len(labels))))
158-
ds = ds.map(
159-
lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]}
160-
)
150+
ds = ds.map(lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]})
161151
# Then we can create the ClassLabel
162152
ds = ds.cast_column("ner_tags", class_label)
163153
return ds

tasks.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,17 @@ def lint(ctx: Context) -> None:
7676
Check typing and formatting.
7777
"""
7878
ctx.run(
79-
f"mypy {SOURCES}",
79+
f"ruff format {SOURCES} --check",
8080
pty=True,
8181
echo=True,
8282
)
8383
ctx.run(
84-
f"black {SOURCES} --check",
84+
f"ruff check {SOURCES}",
8585
pty=True,
8686
echo=True,
8787
)
8888
ctx.run(
89-
f"ruff check {SOURCES}",
89+
f"mypy {SOURCES}",
9090
pty=True,
9191
echo=True,
9292
)
@@ -98,7 +98,7 @@ def format(ctx: Context) -> None:
9898
Format the code.
9999
"""
100100
ctx.run(
101-
f"black {SOURCES}",
101+
f"ruff format {SOURCES}",
102102
pty=True,
103103
echo=True,
104104
)

tests/test_conversion.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@
2121
(SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
2222
],
2323
)
24-
def test_spacy_to_hf(
25-
spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
26-
) -> None:
24+
def test_spacy_to_hf(spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]) -> None:
2725
hf_data = spacy_to_hf(spacy_data, "bert-base-cased")
2826
assert hf_data["tokens"][0] == hf_tokens
2927
assert hf_data["ner_tags"][0] == hf_tags
@@ -36,9 +34,7 @@ def test_spacy_to_hf(
3634
(SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
3735
],
3836
)
39-
def test_spacy_to_hf_as_dataset(
40-
spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
41-
) -> None:
37+
def test_spacy_to_hf_as_dataset(spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]) -> None:
4238
hf_data = spacy_to_hf(spacy_data, "bert-base-cased", as_hf_dataset=True)
4339
hf_non_o_tags = [i for i in hf_tags if i != "O"]
4440
sorted_tags = ["O"] + sorted(set(hf_non_o_tags))

0 commit comments

Comments (0)