Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update develop from master for v3.7 #13011

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1d216a7
Update README for v3.6 (#12844)
adrianeboyd Jul 24, 2023
e2b8901
Add spacy-llm docs to website (#12782)
victorialslocum Jul 24, 2023
f8f489b
Switch from distutils to setuptools/sysconfig (#12853)
adrianeboyd Jul 24, 2023
98799d8
`SpanCat`: Remove invalid `threshold` config argument (#12860)
shadeMe Jul 26, 2023
51b9655
Added OdyCy to spaCy Universe (#12826)
x-tabdeveloping Jul 26, 2023
49055ed
Add cli for finding locations of registered func (#12757)
victorialslocum Jul 31, 2023
c9e9dcc
Add displaCy data structures to docs (2) (#12875)
svlandeg Jul 31, 2023
186889e
added entry for SaysWho (#12828)
afriedman412 Jul 31, 2023
a0a1956
Tests for CLI app - `init config` generates `train`-able config (#12173)
pmbaumgartner Jul 31, 2023
222bd3c
Display model's full base version string in incompatiblity warning (#…
shadeMe Aug 2, 2023
07407e0
fix the regular expression matching on the full text (#12883)
arplusman Aug 2, 2023
3b7faf4
fix (#12881)
svlandeg Aug 3, 2023
45af8a5
Update br tags (#12882)
adrianeboyd Aug 4, 2023
245e2dd
Allow pydantic v2 using transitional v1 support (#12888)
adrianeboyd Aug 8, 2023
c4e378d
Update CuPy extras (#12890)
adrianeboyd Aug 8, 2023
458bc5f
Set version to v3.6.1 (#12892)
adrianeboyd Aug 8, 2023
d50b8d5
Update examples.py (#12895)
denizcodeyaa Aug 11, 2023
64b8ee2
Update universe.json (#12904)
wjbmattingly Aug 14, 2023
76a9f9c
Docs: clarify abstract spacy.load examples (#12889)
adrianeboyd Aug 16, 2023
6dd5686
📝 Fix formula for receptive field in docs (#12918)
connorbrinton Aug 21, 2023
d8a32c1
docs: fix ngram_range_suggester max_size description (#12939)
pdhall99 Aug 29, 2023
52758e1
Add headers to netlify.toml [ci skip]
ines Aug 30, 2023
3e42648
Update large-language-models.mdx (#12944)
koaning Aug 30, 2023
065ead4
updated `add_pipe` docs (#12947)
davidberenstein1957 Sep 1, 2023
5c1f926
fix typo in link (#12948)
svlandeg Sep 1, 2023
6d1f6d9
Fix LLM usage example (#12950)
svlandeg Sep 4, 2023
cc78847
fix training.batch_size example (#12963)
magdaaniol Sep 6, 2023
def7013
Docs for spacy-llm 0.5.0 (#12968)
svlandeg Sep 8, 2023
013762b
Few spacy-llm doc fixes (#12969)
svlandeg Sep 8, 2023
8f0d6b0
Fix in BertTokenizer docs (#12955)
svlandeg Sep 13, 2023
e9f0485
Merge remote-tracking branch 'upstream/master' into chore/update-deve…
adrianeboyd Sep 25, 2023
4e3360a
add --spans-key option for CLI spancat evaluation (#12981)
evornov Sep 25, 2023
7db189d
Merge remote-tracking branch 'upstream/master' into chore/update-deve…
adrianeboyd Sep 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 35 additions & 37 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
Expand Down
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ install_requires =
numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
# Official Python utilities
setuptools
Expand Down Expand Up @@ -116,6 +116,8 @@ cuda117 =
cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
cupy-cuda11x>=11.0.0,<13.0.0
cuda12x =
cupy-cuda12x>=11.5.0,<13.0.0
cuda-autodetect =
cupy-wheel>=11.0.0,<13.0.0
apple =
Expand Down
31 changes: 3 additions & 28 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#!/usr/bin/env python
from setuptools import Extension, setup, find_packages
import sys
import platform
import numpy
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
from setuptools.command.build_ext import build_ext
from sysconfig import get_path
from pathlib import Path
import shutil
from Cython.Build import cythonize
Expand Down Expand Up @@ -88,30 +87,6 @@
}


def is_new_osx():
    """Report whether the build host is macOS 10.x with a minor version >= 7.

    Returns False on every non-darwin platform, and also for any macOS
    release whose version string does not start with "10" (so 11 and later
    are reported as False by this check).
    """
    # Guard: anything that is not macOS can never qualify.
    if sys.platform != "darwin":
        return False
    release = platform.mac_ver()[0]
    # Guard: only version strings beginning with "10" are inspected further.
    if not release.startswith("10"):
        return False
    # The second dotted component is the minor version, e.g. "10.7.5" -> 7.
    return int(release.split(".")[1]) >= 7


if is_new_osx():
# On Mac, use libc++ because Apple deprecated use of
# libstdc
COMPILE_OPTIONS["other"].append("-stdlib=libc++")
LINK_OPTIONS["other"].append("-lc++")
# g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
# See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
LINK_OPTIONS["other"].append("-nodefaultlibs")


# By subclassing build_extensions we get access to the actual compiler that will be used, which is only really known after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
class build_ext_options:
Expand Down Expand Up @@ -204,7 +179,7 @@ def setup_package():

include_dirs = [
numpy.get_include(),
get_python_inc(plat_specific=True),
get_path("include"),
]
ext_modules = []
ext_modules.append(
Expand Down
1 change: 1 addition & 0 deletions spacy/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .debug_model import debug_model # noqa: F401
from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
from .find_function import find_function # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401
Expand Down
3 changes: 2 additions & 1 deletion spacy/cli/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def assemble_cli(

DOCS: https://spacy.io/api/cli#assemble
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
# Make sure all files and paths exist if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
Expand Down
2 changes: 2 additions & 0 deletions spacy/cli/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def evaluate_cli(
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
# fmt: on
):
"""
Expand All @@ -53,6 +54,7 @@ def evaluate_cli(
displacy_limit=displacy_limit,
per_component=per_component,
silent=False,
spans_key=spans_key,
)


Expand Down
69 changes: 69 additions & 0 deletions spacy/cli/find_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import Optional, Tuple

from catalogue import RegistryError
from wasabi import msg

from ..util import registry
from ._util import Arg, Opt, app


@app.command("find-function")
def find_function_cli(
    # fmt: off
    func_name: str = Arg(..., help="Name of the registered function."),
    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
    # fmt: on
):
    """
    Find the module, path and line number to the file the registered
    function is defined in, if available.

    func_name (str): Name of the registered function.
    registry_name (Optional[str]): Name of the catalogue registry.

    DOCS: https://spacy.io/api/cli#find-function
    """
    # NOTE(review): the docstring above is kept verbatim; it presumably
    # surfaces as the command's help text -- confirm before rewording.
    if not registry_name:
        # No registry given: take the first registry that knows func_name,
        # falling back to the (empty) value that was passed in.
        registry_name = next(
            (
                candidate
                for candidate in registry.get_registry_names()
                if registry.has(candidate, func_name)
            ),
            registry_name,
        )

    if not registry_name:
        msg.fail(
            f"Couldn't find registered function: '{func_name}'",
            exits=1,
        )

    # Narrows Optional[str] to str for the call below.
    assert registry_name is not None
    find_function(func_name, registry_name)


def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
    """Locate the source file and line of a registered function.

    Looks up `func_name` in the registry called `registry_name` via
    `registry.find`, prints the location with `msg.good` and returns it.
    Lookup errors and missing location data are reported through
    `msg.fail(..., exits=1)`.

    func_name (str): Name of the registered function.
    registry_name (str): Name of the registry to search in.
    RETURNS (Tuple[str, int]): The file path (as a string) and line number.
    """
    description = None
    try:
        description = registry.find(registry_name, func_name)
    except RegistryError as e:
        # Emit a readable summary first, then the underlying registry error.
        msg.fail(
            f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
        )
        msg.fail(f"{e}", exits=1)
    assert description is not None

    # The line number is only read when a file entry is present.
    file_entry = description["file"]
    if file_entry:
        registry_path, line_no = file_entry, description["line_no"]
    else:
        registry_path, line_no = None, None

    if not (registry_path and line_no):
        msg.fail(
            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
            exits=1,
        )
    # Narrow the Optional values for the return statement.
    assert registry_path is not None
    assert line_no is not None

    msg.good(f"Found registered function '{func_name}' at {registry_path}:{line_no}")
    return str(registry_path), int(line_no)
4 changes: 2 additions & 2 deletions spacy/cli/find_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def find_threshold_cli(

DOCS: https://spacy.io/api/cli#find-threshold
"""

util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
import_code(code_path)
find_threshold(
model=model,
Expand Down
9 changes: 6 additions & 3 deletions spacy/cli/init_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def init_vectors_cli(
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
Expand Down Expand Up @@ -87,7 +88,8 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
Expand Down Expand Up @@ -116,7 +118,8 @@ def init_labels_cli(
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
Expand Down
2 changes: 1 addition & 1 deletion spacy/cli/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def _format_sources(data: Any) -> str:
if author:
result += " ({})".format(author)
sources.append(result)
return "<br />".join(sources)
return "<br>".join(sources)


def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
Expand Down
3 changes: 2 additions & 1 deletion spacy/cli/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def train_cli(

DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
Expand Down
6 changes: 4 additions & 2 deletions spacy/displacy/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ def render(
self.lang = settings.get("lang", DEFAULT_LANG)
render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"])
if p.get("title"):
svg = TPL_TITLE.format(title=p.get("title")) + svg
rendered.append(svg)
if page:
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
Expand Down Expand Up @@ -565,7 +567,7 @@ def render_ents(
for i, fragment in enumerate(fragments):
markup += escape_html(fragment)
if len(fragments) > 1 and i != len(fragments) - 1:
markup += "</br>"
markup += "<br>"
if self.ents is None or label.upper() in self.ents:
color = self.colors.get(label.upper(), self.default_color)
ent_settings = {
Expand All @@ -583,7 +585,7 @@ def render_ents(
for i, fragment in enumerate(fragments):
markup += escape_html(fragment)
if len(fragments) > 1 and i != len(fragments) - 1:
markup += "</br>"
markup += "<br>"
markup = TPL_ENTS.format(content=markup, dir=self.direction)
if title:
markup = TPL_TITLE.format(title=title) + markup
Expand Down
3 changes: 3 additions & 0 deletions spacy/lang/tr/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,7 @@
"Türkiye'nin başkenti neresi?",
"Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
"Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
"Cemal Sureya kimdir?",
"Bunlari Biliyor muydunuz?",
"Altinoluk Turkiye haritasinin neresinde yer alir?",
]
4 changes: 2 additions & 2 deletions spacy/ml/models/tok2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec(
are between 2 and 8.
window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be
depth * (window_size * 2 + 1), so a 4-layer network with window_size of
2 will be sensitive to 20 words at a time. Recommended value is 1.
depth * window_size * 2 + 1, so a 4-layer network with window_size of
2 will be sensitive to 17 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000.
Expand Down
8 changes: 6 additions & 2 deletions spacy/pipeline/_edit_tree_internals/schemas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from collections import defaultdict
from typing import Any, Dict, List, Union

from pydantic import BaseModel, Field, ValidationError
from pydantic.types import StrictBool, StrictInt, StrictStr
try:
from pydantic.v1 import BaseModel, Field, ValidationError
from pydantic.v1.types import StrictBool, StrictInt, StrictStr
except ImportError:
from pydantic import BaseModel, Field, ValidationError # type: ignore
from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore


class MatchNodeSchema(BaseModel):
Expand Down
41 changes: 28 additions & 13 deletions spacy/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,34 @@
Union,
)

from pydantic import (
BaseModel,
ConstrainedStr,
Field,
StrictBool,
StrictFloat,
StrictInt,
StrictStr,
ValidationError,
create_model,
validator,
)
from pydantic.main import ModelMetaclass
try:
from pydantic.v1 import (
BaseModel,
ConstrainedStr,
Field,
StrictBool,
StrictFloat,
StrictInt,
StrictStr,
ValidationError,
create_model,
validator,
)
from pydantic.v1.main import ModelMetaclass
except ImportError:
from pydantic import ( # type: ignore
BaseModel,
ConstrainedStr,
Field,
StrictBool,
StrictFloat,
StrictInt,
StrictStr,
ValidationError,
create_model,
validator,
)
from pydantic.main import ModelMetaclass # type: ignore
from thinc.api import ConfigValidationError, Model, Optimizer
from thinc.config import Promise

Expand Down
7 changes: 6 additions & 1 deletion spacy/tests/pipeline/test_initialize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import pytest
from pydantic import StrictBool

try:
from pydantic.v1 import StrictBool
except ImportError:
from pydantic import StrictBool # type: ignore

from thinc.api import ConfigValidationError

from spacy.lang.en import English
Expand Down
7 changes: 6 additions & 1 deletion spacy/tests/pipeline/test_pipe_factories.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import pytest
from pydantic import StrictInt, StrictStr

try:
from pydantic.v1 import StrictInt, StrictStr
except ImportError:
from pydantic import StrictInt, StrictStr # type: ignore

from thinc.api import ConfigValidationError, Linear, Model

import spacy
Expand Down
Loading