Skip to content

Commit 350ac84

Browse files
authored
Merge pull request #5 from Ben-Epstein/update-repo-312-release
update for release
2 parents 8aee01a + 042ff3d commit 350ac84

File tree

8 files changed

+25
-58
lines changed

8 files changed

+25
-58
lines changed

.github/workflows/test.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ jobs:
1212
runs-on: ubuntu-latest
1313
strategy:
1414
matrix:
15-
python-version: ["3.10"]
15+
python-version: ["3.10", "3.11", "3.12"]
1616
fail-fast: false
1717
steps:
1818
- name: checkout
19-
uses: actions/checkout@v2
19+
uses: actions/checkout@v4
2020
with:
2121
fetch-depth: 0
2222

.pre-commit-config.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# See https://pre-commit.com/hooks.html for more hooks
33
repos:
44
- repo: https://github.com/pre-commit/pre-commit-hooks
5-
rev: v4.4.0
5+
rev: v4.6.0
66
hooks:
77
- id: check-added-large-files
88
- id: check-toml
@@ -11,12 +11,8 @@ repos:
1111
- --unsafe
1212
- id: end-of-file-fixer
1313
- id: trailing-whitespace
14-
- repo: https://github.com/psf/black
15-
rev: 23.3.0
16-
hooks:
17-
- id: black
1814
- repo: https://github.com/charliermarsh/ruff-pre-commit
19-
rev: v0.0.265
15+
rev: v0.6.8
2016
hooks:
2117
- id: ruff
2218

pyproject.toml

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dynamic = ["version"]
1111
readme = "README.md"
1212
license = {text = 'Apache 2.0'}
1313
description = "Spacy to HF converter"
14-
requires-python = ">=3.7, <3.12"
14+
requires-python = ">=3.7, <3.13"
1515
dependencies = [
1616
"spacy-alignments",
1717
"spacy < 4",
@@ -25,7 +25,6 @@ email = "[email protected]"
2525

2626
[project.optional-dependencies]
2727
dev = [
28-
"black >=21.10b0",
2928
"coverage >=6.1.1",
3029
"invoke >=2.0.0",
3130
"mypy >=0.910",
@@ -44,11 +43,13 @@ Documentation = "https://github.com/ben-epstein/spacy-to-hf"
4443
version = {attr = "spacy_to_hf.__version__"}
4544

4645
[tool.ruff]
47-
line-length = 88
48-
ignore = ["D10"]
46+
line-length = 120
4947
include = ["*.py"]
50-
select = ["E", "F", "I"]
5148
target-version = "py310"
49+
50+
[tool.ruff.lint]
51+
select = ["E", "F", "I"]
52+
ignore = ["D10"]
5253
extend-ignore = [
5354
"D203",
5455
"D204",
@@ -64,25 +65,10 @@ extend-ignore = [
6465
"D415",
6566
]
6667

67-
[tool.ruff.pydocstyle]
68+
69+
[tool.ruff.lint.pydocstyle]
6870
convention = "google"
6971

70-
[tool.black]
71-
target-version = ['py310']
72-
include = '\.pyi?$'
73-
exclude = '''
74-
/(
75-
\.eggs
76-
| \.git
77-
| \.hg
78-
| \.*_cache
79-
| \.tox
80-
| \.venv
81-
| build
82-
| dist
83-
| __pycache__
84-
)/
85-
'''
8672

8773
[tool.mypy]
8874
ignore_missing_imports = true

spacy_to_hf/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.0.3"
1+
__version__ = "0.0.4"
22

33
from spacy_to_hf.conversion import spacy_to_hf
44

spacy_to_hf/conversion.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,7 @@ def spacy_to_hf(
8282
spans = row["spans"]
8383
assert isinstance(spans, list), "Spans must be a list"
8484
assert all(
85-
isinstance(span, dict) and sorted(span.keys()) == ["end", "label", "start"]
86-
for span in spans
85+
isinstance(span, dict) and sorted(span.keys()) == ["end", "label", "start"] for span in spans
8786
), "All spans must have keys 'start', 'end', and 'label'"
8887
text = row["text"]
8988
doc = nlp(text) # type: ignore

spacy_to_hf/utils.py

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ def _get_label(tag: str) -> str:
3434
return tag.split("-")[1]
3535

3636

37-
def _handle_unit_tag(
38-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
39-
) -> str:
37+
def _handle_unit_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
4038
"""Process a Unit tag
4139
4240
If a Unit tagged token is broken into multiple sub-tokens, we want the first
@@ -54,9 +52,7 @@ def _handle_unit_tag(
5452
return clean_tag
5553

5654

57-
def _handle_begin_tag(
58-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
59-
) -> str:
55+
def _handle_begin_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
6056
"""Process a Begin tag
6157
6258
For Begin tagged tokens that are broken into sub-tokens, we know that there will be
@@ -69,9 +65,7 @@ def _handle_begin_tag(
6965
return clean_tag
7066

7167

72-
def _handle_last_tag(
73-
tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int
74-
) -> str:
68+
def _handle_last_tag(tag: str, tokens: List[List[int]], cur_idx: int, tok_num: int) -> str:
7569
"""Process a Last tag
7670
7771
For Last tagged tokens that are broken into sub-tokens, we know that there will be
@@ -84,9 +78,7 @@ def _handle_last_tag(
8478
return clean_tag
8579

8680

87-
def map_spacy_to_hf_tags(
88-
hf_to_spacy: List[List[int]], spacy_tags: List[str]
89-
) -> List[str]:
81+
def map_spacy_to_hf_tags(hf_to_spacy: List[List[int]], spacy_tags: List[str]) -> List[str]:
9082
"""Maps the spacy_tags to the required huggingface tags
9183
9284
Leverages the hf_to_spacy map, showing how each huggingface token maps
@@ -155,9 +147,7 @@ def dict_to_dataset(hf_data: Dict[str, List[str]]) -> Dataset:
155147
class_label = Sequence(feature=ClassLabel(num_classes=len(labels), names=labels))
156148
# First need to string index the ner_tags
157149
label_to_idx = dict(zip(labels, range(len(labels))))
158-
ds = ds.map(
159-
lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]}
160-
)
150+
ds = ds.map(lambda row: {"ner_tags": [label_to_idx[tag] for tag in row["ner_tags"]]})
161151
# Then we can create the ClassLabel
162152
ds = ds.cast_column("ner_tags", class_label)
163153
return ds

tasks.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,17 @@ def lint(ctx: Context) -> None:
7676
Check typing and formatting.
7777
"""
7878
ctx.run(
79-
f"mypy {SOURCES}",
79+
f"ruff format {SOURCES} --check",
8080
pty=True,
8181
echo=True,
8282
)
8383
ctx.run(
84-
f"black {SOURCES} --check",
84+
f"ruff check {SOURCES}",
8585
pty=True,
8686
echo=True,
8787
)
8888
ctx.run(
89-
f"ruff check {SOURCES}",
89+
f"mypy {SOURCES}",
9090
pty=True,
9191
echo=True,
9292
)
@@ -98,7 +98,7 @@ def format(ctx: Context) -> None:
9898
Format the code.
9999
"""
100100
ctx.run(
101-
f"black {SOURCES}",
101+
f"ruff format {SOURCES}",
102102
pty=True,
103103
echo=True,
104104
)

tests/test_conversion.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@
2121
(SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
2222
],
2323
)
24-
def test_spacy_to_hf(
25-
spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
26-
) -> None:
24+
def test_spacy_to_hf(spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]) -> None:
2725
hf_data = spacy_to_hf(spacy_data, "bert-base-cased")
2826
assert hf_data["tokens"][0] == hf_tokens
2927
assert hf_data["ner_tags"][0] == hf_tags
@@ -36,9 +34,7 @@ def test_spacy_to_hf(
3634
(SPACY_DATA_2, HF_TOKENS_2, HF_TAGS_2),
3735
],
3836
)
39-
def test_spacy_to_hf_as_dataset(
40-
spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]
41-
) -> None:
37+
def test_spacy_to_hf_as_dataset(spacy_data: List[Dict], hf_tokens: List[str], hf_tags: List[str]) -> None:
4238
hf_data = spacy_to_hf(spacy_data, "bert-base-cased", as_hf_dataset=True)
4339
hf_non_o_tags = [i for i in hf_tags if i != "O"]
4440
sorted_tags = ["O"] + sorted(set(hf_non_o_tags))

0 commit comments

Comments (0)