From 8113cfb2573ec64be08fef22d6682e81c0416bd1 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 5 Jul 2023 13:36:04 +0200 Subject: [PATCH 01/39] `Language.replace_listeners`: Pass the replaced listener and the `tok2vec` pipe to the callback (#12785) * `Language.replace_listeners`: Pass the replaced listener and the `tok2vec` pipe to the callback * Update developer docs * `isort` fixes * Add error message to assertion * Add clarification to dev docs * Replace assertion with exception * Doc fixes --- extra/DEVELOPER_DOCS/Listeners.md | 39 +++++++++++++++++++++---------- spacy/errors.py | 2 ++ spacy/language.py | 17 ++++++++++++-- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/extra/DEVELOPER_DOCS/Listeners.md b/extra/DEVELOPER_DOCS/Listeners.md index 3a71082e0b6..72c03688069 100644 --- a/extra/DEVELOPER_DOCS/Listeners.md +++ b/extra/DEVELOPER_DOCS/Listeners.md @@ -1,14 +1,17 @@ # Listeners -1. [Overview](#1-overview) -2. [Initialization](#2-initialization) - - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) - - [B. Shape inference](#2b-shape-inference) -3. [Internal communication](#3-internal-communication) - - [A. During prediction](#3a-during-prediction) - - [B. During training](#3b-during-training) - - [C. Frozen components](#3c-frozen-components) -4. [Replacing listener with standalone](#4-replacing-listener-with-standalone) +- [1. Overview](#1-overview) +- [2. Initialization](#2-initialization) + - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) + - [2B. Shape inference](#2b-shape-inference) +- [3. Internal communication](#3-internal-communication) + - [3A. During prediction](#3a-during-prediction) + - [3B. During training](#3b-during-training) + - [Training with multiple listeners](#training-with-multiple-listeners) + - [3C. Frozen components](#3c-frozen-components) + - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen) + - [The upstream component is frozen](#the-upstream-component-is-frozen) +- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone) ## 1. Overview @@ -62,7 +65,7 @@ of this `find_listener()` method will specifically identify sublayers of a model If it's a Transformer-based pipeline, a [`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py) -has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener` +has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener` sublayers of downstream components. ### 2B. Shape inference @@ -154,7 +157,7 @@ as a tagger or a parser. This used to be impossible before 3.1, but has become s embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components) list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes. -However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related +However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`. 
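A minimal config sketch, assuming a frozen `tok2vec` feeding a trainable downstream component such as `ner`, would list the embedding component in both sections:

```ini
[nlp]
pipeline = ["tok2vec","ner"]

[training]
frozen_components = ["tok2vec"]
annotating_components = ["tok2vec"]
```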
#### The upstream component is frozen @@ -216,5 +219,17 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model) ``` The new config and model are then properly stored on the `nlp` object. -Note that this functionality (running the replacement for a transformer listener) was broken prior to +Note that this functionality (running the replacement for a transformer listener) was broken prior to `spacy-transformers` 1.0.5. + +In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback: +the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatiblity, +the method only passes these extra arguments for callbacks that support them: + +``` +def replace_listener_pre_37(copied_tok2vec_model): + ... + +def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe): + ... +``` diff --git a/spacy/errors.py b/spacy/errors.py index db1a886aa8f..a2f8ca85c3b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -981,6 +981,8 @@ class Errors(metaclass=ErrorsWithCodes): " 'min_length': {min_length}, 'max_length': {max_length}") E1054 = ("The text, including whitespace, must match between reference and " "predicted docs when training {component}.") + E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " + "but only callbacks with one or three parameters are supported") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/language.py b/spacy/language.py index fd616483be8..6a848bf9aa9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,4 +1,5 @@ import functools +import inspect import itertools import multiprocessing as mp import random @@ -2033,8 +2034,20 @@ def replace_listeners( # Go over the listener layers and replace them for listener in pipe_listeners: new_model = tok2vec_model.copy() - if "replace_listener" in tok2vec_model.attrs: - new_model = tok2vec_model.attrs["replace_listener"](new_model) + replace_listener_func = tok2vec_model.attrs.get("replace_listener") + if replace_listener_func is not None: + # Pass the extra args to the callback without breaking compatibility with + # old library versions that only expect a single parameter. 
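+            # A one-parameter callback follows the legacy (pre-3.7) signature and only
+            # receives the copied tok2vec/transformer model; a three-parameter callback
+            # additionally receives the replaced listener and the pipe it was copied
+            # from. Any other arity is rejected with E1055 below.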
+ num_params = len( + inspect.signature(replace_listener_func).parameters + ) + if num_params == 1: + new_model = replace_listener_func(new_model) + elif num_params == 3: + new_model = replace_listener_func(new_model, listener, tok2vec) + else: + raise ValueError(Errors.E1055.format(num_params=num_params)) + util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined] tok2vec.remove_listener(listener, pipe_name) From d195923164823d8ce207506862b35c60463188ea Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 6 Jul 2023 18:29:03 +0200 Subject: [PATCH 02/39] Set version to `3.7.0.dev0` (#12799) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index cad6158dac3..71a728128fc 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.6.0" +__version__ = "3.7.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 991bcc111e1a35cc96dba32ac08c212b0b360384 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jul 2023 08:09:57 +0200 Subject: [PATCH 03/39] disable tests until 3.7 models are available --- .github/workflows/tests.yml | 54 ++++++++++++++++++------------------- spacy/tests/test_cli.py | 2 ++ 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d60c90c1c91..f177fbcb679 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -111,22 +111,22 @@ jobs: - name: Test import run: python -W error -c "import spacy" - - name: "Test download CLI" - run: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - if: matrix.python_version == '3.9' - - - name: "Test download_url in info CLI" - run: | - python -W error -m spacy info ca_core_news_sm | grep -q download_url - if: matrix.python_version == '3.9' - - - name: "Test no warnings on load (#11713)" - run: | - python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - if: matrix.python_version == '3.9' +# - name: "Test download CLI" +# run: | +# python -m spacy download ca_core_news_sm +# python -m spacy download ca_core_news_md +# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" +# if: matrix.python_version == '3.9' +# +# - name: "Test download_url in info CLI" +# run: | +# python -W error -m spacy info ca_core_news_sm | grep -q download_url +# if: matrix.python_version == '3.9' +# +# - name: "Test no warnings on load (#11713)" +# run: | +# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" +# if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | @@ -150,17 +150,17 @@ jobs: python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 if: matrix.python_version == '3.9' - - name: "Test assemble CLI" - run: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - if: 
matrix.python_version == '3.9' - - - name: "Test assemble CLI vectors warning" - run: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - if: matrix.python_version == '3.9' +# - name: "Test assemble CLI" +# run: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" +# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir +# if: matrix.python_version == '3.9' +# +# - name: "Test assemble CLI vectors warning" +# run: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" +# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 +# if: matrix.python_version == '3.9' - name: "Install test requirements" run: | diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8e1c9ca3215..f5a7aadb8bd 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -697,6 +697,7 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] +@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -707,6 +708,7 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) +@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From b1b20bf69df6113b51fd94b5249188d9b9e4c1b4 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 7 Jul 2023 09:10:27 +0200 Subject: [PATCH 04/39] Replace projects functionality with weasel (#12769) * Setting up weasel branch (#12456) * remove project-specific functionality * remove project-specific tests * remove project-specific schemas * remove project-specific information in about * remove project-specific functions in util.py * remove project-specific error strings * remove project-specific CLI commands * black formatting * restore some functions that are used beyond projects * remove project imports * remove imports * remove remote_storage tests * remove one more project unit test * update for PR 12394 * remove get_hash and get_checksum * remove upload_ and download_file methods * remove ensure_pathy * revert clumsy fingers * reinstate E970 * feat: use weasel as spacy project command (#12473) * feat: use weasel as spacy project command * build: use constrained requirement for weasel * feat: add weasel to the library requirements * build: update weasel to new version * build: use specific weasel tag * build: use weasel-0.1.0rc1 from PyPI * fix: remove weasel from requirements.txt * fix: requirements.txt and setup.cfg need to reflect each other * feat: remove legacy spacy project code * bump version * further merge fixes * isort --------- Co-authored-by: Basile Dura --- requirements.txt | 1 + setup.cfg | 1 + spacy/about.py | 2 - spacy/cli/__init__.py | 7 - spacy/cli/_util.py | 329 +----------------------- spacy/cli/project/__init__.py | 0 spacy/cli/project/assets.py | 217 ---------------- spacy/cli/project/clone.py | 124 
--------- spacy/cli/project/document.py | 115 --------- spacy/cli/project/dvc.py | 220 ---------------- spacy/cli/project/pull.py | 67 ----- spacy/cli/project/push.py | 69 ----- spacy/cli/project/remote_storage.py | 212 ---------------- spacy/cli/project/run.py | 379 ---------------------------- spacy/errors.py | 2 - spacy/schemas.py | 60 ----- spacy/tests/test_cli.py | 293 +-------------------- spacy/util.py | 24 -- 18 files changed, 9 insertions(+), 2113 deletions(-) delete mode 100644 spacy/cli/project/__init__.py delete mode 100644 spacy/cli/project/assets.py delete mode 100644 spacy/cli/project/clone.py delete mode 100644 spacy/cli/project/document.py delete mode 100644 spacy/cli/project/dvc.py delete mode 100644 spacy/cli/project/pull.py delete mode 100644 spacy/cli/project/push.py delete mode 100644 spacy/cli/project/remote_storage.py delete mode 100644 spacy/cli/project/run.py diff --git a/requirements.txt b/requirements.txt index a007f495e62..f5050fee2b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 +weasel>=0.1.0,<0.2.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index 45734888fdb..048bb37197e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 + weasel>=0.1.0,<0.2.0 # Third-party dependencies typer>=0.3.0,<0.10.0 pathy>=0.10.0 diff --git a/spacy/about.py b/spacy/about.py index 71a728128fc..d816926fdd9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,5 +3,3 @@ __version__ = "3.7.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" -__projects__ = "https://github.com/explosion/projects" -__projects_branch__ = "v3" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 549a27616d8..4fc076f9a23 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -21,13 +21,6 @@ from .package import package # noqa: F401 from .pretrain import pretrain # noqa: F401 from .profile import profile # noqa: F401 -from .project.assets import project_assets # noqa: F401 -from .project.clone import project_clone # noqa: F401 -from .project.document import project_document # noqa: F401 -from .project.dvc import project_update_dvc # noqa: F401 -from .project.pull import project_pull # noqa: F401 -from .project.push import project_push # noqa: F401 -from .project.run import project_run # noqa: F401 from .train import train_cli # noqa: F401 from .validate import validate # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index eff897316d4..bc6c53cd96c 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -25,10 +25,11 @@ from thinc.util import gpu_is_available from typer.main import get_command from wasabi import Printer, msg +from weasel import app as project_cli from .. 
import about from ..compat import Literal -from ..schemas import ProjectConfigSchema, validate +from ..schemas import validate from ..util import ( ENV_VARS, SimpleFrozenDict, @@ -48,7 +49,6 @@ WHEEL_SUFFIX = "-py3-none-any.whl" PROJECT_FILE = "project.yml" -PROJECT_LOCK = "project.lock" COMMAND = "python -m spacy" NAME = "spacy" HELP = """spaCy Command-line Interface @@ -74,11 +74,10 @@ app = typer.Typer(name=NAME, help=HELP) benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) -project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) -app.add_typer(project_cli) +app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) app.add_typer(debug_cli) app.add_typer(benchmark_cli) app.add_typer(init_cli) @@ -153,148 +152,6 @@ def _parse_override(value: Any) -> Any: return str(value) -def load_project_config( - path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() -) -> Dict[str, Any]: - """Load the project.yml file from a directory and validate it. Also make - sure that all directories defined in the config exist. - - path (Path): The path to the project directory. - interpolate (bool): Whether to substitute project variables. - overrides (Dict[str, Any]): Optional config overrides. - RETURNS (Dict[str, Any]): The loaded project.yml. - """ - config_path = path / PROJECT_FILE - if not config_path.exists(): - msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) - invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." - try: - config = srsly.read_yaml(config_path) - except ValueError as e: - msg.fail(invalid_err, e, exits=1) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(invalid_err) - print("\n".join(errors)) - sys.exit(1) - validate_project_version(config) - validate_project_commands(config) - if interpolate: - err = f"{PROJECT_FILE} validation error" - with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config, overrides) - # Make sure directories defined in config exist - for subdir in config.get("directories", []): - dir_path = path / subdir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - return config - - -def substitute_project_variables( - config: Dict[str, Any], - overrides: Dict[str, Any] = SimpleFrozenDict(), - key: str = "vars", - env_key: str = "env", -) -> Dict[str, Any]: - """Interpolate variables in the project file using the config system. - - config (Dict[str, Any]): The project config. - overrides (Dict[str, Any]): Optional config overrides. - key (str): Key containing variables in project config. - env_key (str): Key containing environment variable mapping in project config. - RETURNS (Dict[str, Any]): The interpolated project config. 
- """ - config.setdefault(key, {}) - config.setdefault(env_key, {}) - # Substitute references to env vars with their values - for config_var, env_var in config[env_key].items(): - config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) - # Need to put variables in the top scope again so we can have a top-level - # section "project" (otherwise, a list of commands in the top scope wouldn't) - # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) - cfg = Config().from_str(cfg.to_str(), overrides=overrides) - interpolated = cfg.interpolate() - return dict(interpolated["project"]) - - -def validate_project_version(config: Dict[str, Any]) -> None: - """If the project defines a compatible spaCy version range, chec that it's - compatible with the current version of spaCy. - - config (Dict[str, Any]): The loaded config. - """ - spacy_version = config.get("spacy_version", None) - if spacy_version and not is_compatible_version(about.__version__, spacy_version): - err = ( - f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) " - f"that's not compatible with the version of spaCy you're running " - f"({about.__version__}). You can edit version requirement in the " - f"{PROJECT_FILE} to load it, but the project may not run as expected." - ) - msg.fail(err, exits=1) - - -def validate_project_commands(config: Dict[str, Any]) -> None: - """Check that project commands and workflows are valid, don't contain - duplicates, don't clash and only refer to commands that exist. - - config (Dict[str, Any]): The loaded config. - """ - command_names = [cmd["name"] for cmd in config.get("commands", [])] - workflows = config.get("workflows", {}) - duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) - if duplicates: - err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" - msg.fail(err, exits=1) - for workflow_name, workflow_steps in workflows.items(): - if workflow_name in command_names: - err = f"Can't use workflow name '{workflow_name}': name already exists as a command" - msg.fail(err, exits=1) - for step in workflow_steps: - if step not in command_names: - msg.fail( - f"Unknown command specified in workflow '{workflow_name}': {step}", - f"Workflows can only refer to commands defined in the 'commands' " - f"section of the {PROJECT_FILE}.", - exits=1, - ) - - -def get_hash(data, exclude: Iterable[str] = tuple()) -> str: - """Get the hash for a JSON-serializable object. - - data: The data to hash. - exclude (Iterable[str]): Top-level keys to exclude if data is a dict. - RETURNS (str): The hash. - """ - if isinstance(data, dict): - data = {k: v for k, v in data.items() if k not in exclude} - data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") - return hashlib.md5(data_str).hexdigest() - - -def get_checksum(path: Union[Path, str]) -> str: - """Get the checksum for a file or directory given its file path. If a - directory path is provided, this uses all files in that directory. - - path (Union[Path, str]): The file or directory path. - RETURNS (str): The checksum. 
- """ - path = Path(path) - if not (path.is_file() or path.is_dir()): - msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) - if path.is_file(): - return hashlib.md5(Path(path).read_bytes()).hexdigest() - else: - # TODO: this is currently pretty slow - dir_checksum = hashlib.md5() - for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): - dir_checksum.update(sub_file.read_bytes()) - return dir_checksum.hexdigest() - - @contextmanager def show_validation_error( file_path: Optional[Union[str, Path]] = None, @@ -352,166 +209,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) -def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None: - """Upload a file. - - src (Path): The source path. - url (str): The destination URL to upload to. - """ - import smart_open - - # Create parent directories for local paths - if isinstance(dest, Path): - if not dest.parent.exists(): - dest.parent.mkdir(parents=True) - - dest = str(dest) - with smart_open.open(dest, mode="wb") as output_file: - with src.open(mode="rb") as input_file: - output_file.write(input_file.read()) - - -def download_file( - src: Union[str, "FluidPath"], dest: Path, *, force: bool = False -) -> None: - """Download a file using smart_open. - - url (str): The URL of the file. - dest (Path): The destination path. - force (bool): Whether to force download even if file exists. - If False, the download will be skipped. - """ - import smart_open - - if dest.exists() and not force: - return None - src = str(src) - with smart_open.open(src, mode="rb", compression="disable") as input_file: - with dest.open(mode="wb") as output_file: - shutil.copyfileobj(input_file, output_file) - - -def ensure_pathy(path): - """Temporary helper to prevent importing Pathy globally (which can cause - slow and annoying Google Cloud warning).""" - from pathy import Pathy # noqa: F811 - - return Pathy.fluid(path) - - -def git_checkout( - repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False -): - git_version = get_git_version() - if dest.exists(): - msg.fail("Destination of checkout must not exist", exits=1) - if not dest.parent.exists(): - msg.fail("Parent of destination of checkout must exist", exits=1) - if sparse and git_version >= (2, 22): - return git_sparse_checkout(repo, subpath, dest, branch) - elif sparse: - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - with make_tempdir() as tmp_dir: - cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" - run_command(cmd, capture=True) - # We need Path(name) to make sure we also support subdirectories - try: - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - shutil.copytree(str(source_path), str(dest)) - except FileNotFoundError: - err = f"Can't clone {subpath}. 
Make sure the directory exists in the repo (branch '{branch}')" - msg.fail(err, repo, exits=1) - - -def git_sparse_checkout(repo, subpath, dest, branch): - # We're using Git, partial clone and sparse checkout to - # only clone the files we need - # This ends up being RIDICULOUS. omg. - # So, every tutorial and SO post talks about 'sparse checkout'...But they - # go and *clone* the whole repo. Worthless. And cloning part of a repo - # turns out to be completely broken. The only way to specify a "path" is.. - # a path *on the server*? The contents of which, specifies the paths. Wat. - # Obviously this is hopelessly broken and insecure, because you can query - # arbitrary paths on the server! So nobody enables this. - # What we have to do is disable *all* files. We could then just checkout - # the path, and it'd "work", but be hopelessly slow...Because it goes and - # transfers every missing object one-by-one. So the final piece is that we - # need to use some weird git internals to fetch the missings in bulk, and - # *that* we can do by path. - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - # This is the "clone, but don't download anything" part. - cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " - f"-b {branch} --filter=blob:none" - ) - run_command(cmd) - # Now we need to find the missing filenames for the subpath we want. - # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" - ret = run_command(cmd, capture=True) - git_repo = _http_to_git(repo) - # Now pass those missings into another bit of git internals - missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if not missings: - err = ( - f"Could not find any relevant files for '{subpath}'. " - f"Did you specify a correct and complete path within repo '{repo}' " - f"and branch {branch}?" - ) - msg.fail(err, exits=1) - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - run_command(cmd, capture=True) - # And finally, we can checkout our subpath - cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - run_command(cmd, capture=True) - - # Get a subdirectory of the cloned path, if appropriate - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - - shutil.move(str(source_path), str(dest)) - - -def git_repo_branch_exists(repo: str, branch: str) -> bool: - """Uses 'git ls-remote' to check if a repository and branch exists - - repo (str): URL to get repo. - branch (str): Branch on repo to check. - RETURNS (bool): True if repo:branch exists. - """ - get_git_version() - cmd = f"git ls-remote {repo} {branch}" - # We might be tempted to use `--exit-code` with `git ls-remote`, but - # `run_command` handles the `returncode` for us, so we'll rely on - # the fact that stdout returns '' if the requested branch doesn't exist - ret = run_command(cmd, capture=True) - exists = ret.stdout != "" - return exists - - def get_git_version( error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", ) -> Tuple[int, int]: """Get the version of git and raise an error if calling 'git --version' fails. - error (str): The error message to show. RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. 
@@ -527,30 +228,6 @@ def get_git_version( return int(version[0]), int(version[1]) -def _http_to_git(repo: str) -> str: - if repo.startswith("http://"): - repo = repo.replace(r"http://", r"https://") - if repo.startswith(r"https://"): - repo = repo.replace("https://", "git@").replace("/", ":", 1) - if repo.endswith("/"): - repo = repo[:-1] - repo = f"{repo}.git" - return repo - - -def is_subpath_of(parent, child): - """ - Check whether `child` is a path contained within `parent`. - """ - # Based on https://stackoverflow.com/a/37095733 . - - # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so - # we can stop using crusty old os.path functions. - parent_realpath = os.path.realpath(parent) - child_realpath = os.path.realpath(child) - return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath - - @overload def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ... diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py deleted file mode 100644 index aa270598621..00000000000 --- a/spacy/cli/project/assets.py +++ /dev/null @@ -1,217 +0,0 @@ -import os -import re -import shutil -from pathlib import Path -from typing import Any, Dict, Optional - -import requests -import typer -from wasabi import msg - -from ...util import ensure_path, working_dir -from .._util import ( - PROJECT_FILE, - Arg, - Opt, - SimpleFrozenDict, - download_file, - get_checksum, - get_git_version, - git_checkout, - load_project_config, - parse_config_overrides, - project_cli, -) - -# Whether assets are extra if `extra` is not set. -EXTRA_DEFAULT = False - - -@project_cli.command( - "assets", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_assets_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), - extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") - # fmt: on -): - """Fetch project assets like datasets and pretrained weights. Assets are - defined in the "assets" section of the project.yml. If a checksum is - provided in the project.yml, the file is only downloaded if no local file - with the same checksum exists. - - DOCS: https://spacy.io/api/cli#project-assets - """ - overrides = parse_config_overrides(ctx.args) - project_assets( - project_dir, - overrides=overrides, - sparse_checkout=sparse_checkout, - extra=extra, - ) - - -def project_assets( - project_dir: Path, - *, - overrides: Dict[str, Any] = SimpleFrozenDict(), - sparse_checkout: bool = False, - extra: bool = False, -) -> None: - """Fetch assets for a project using DVC if possible. - - project_dir (Path): Path to project directory. - sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files - needed. - extra (bool): Whether to download all assets, including those marked as 'extra'. 
- """ - project_path = ensure_path(project_dir) - config = load_project_config(project_path, overrides=overrides) - assets = [ - asset - for asset in config.get("assets", []) - if extra or not asset.get("extra", EXTRA_DEFAULT) - ] - if not assets: - msg.warn( - f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", - exits=0, - ) - msg.info(f"Fetching {len(assets)} asset(s)") - - for asset in assets: - dest = (project_dir / asset["dest"]).resolve() - checksum = asset.get("checksum") - if "git" in asset: - git_err = ( - f"Cloning spaCy project templates requires Git and the 'git' command. " - f"Make sure it's installed and that the executable is available." - ) - get_git_version(error=git_err) - if dest.exists(): - # If there's already a file, check for checksum - if checksum and checksum == get_checksum(dest): - msg.good( - f"Skipping download with matching checksum: {asset['dest']}" - ) - continue - else: - if dest.is_dir(): - shutil.rmtree(dest) - else: - dest.unlink() - if "repo" not in asset["git"] or asset["git"]["repo"] is None: - msg.fail( - "A git asset must include 'repo', the repository address.", exits=1 - ) - if "path" not in asset["git"] or asset["git"]["path"] is None: - msg.fail( - "A git asset must include 'path' - use \"\" to get the entire repository.", - exits=1, - ) - git_checkout( - asset["git"]["repo"], - asset["git"]["path"], - dest, - branch=asset["git"].get("branch"), - sparse=sparse_checkout, - ) - msg.good(f"Downloaded asset {dest}") - else: - url = asset.get("url") - if not url: - # project.yml defines asset without URL that the user has to place - check_private_asset(dest, checksum) - continue - fetch_asset(project_path, url, dest, checksum) - - -def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: - """Check and validate assets without a URL (private assets that the user - has to provide themselves) and give feedback about the checksum. - - dest (Path): Destination path of the asset. - checksum (Optional[str]): Optional checksum of the expected file. - """ - if not Path(dest).exists(): - err = f"No URL provided for asset. You need to add this file yourself: {dest}" - msg.warn(err) - else: - if not checksum: - msg.good(f"Asset already exists: {dest}") - elif checksum == get_checksum(dest): - msg.good(f"Asset exists with matching checksum: {dest}") - else: - msg.fail(f"Asset available but with incorrect checksum: {dest}") - - -def fetch_asset( - project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> None: - """Fetch an asset from a given URL or path. If a checksum is provided and a - local file exists, it's only re-downloaded if the checksum doesn't match. - - project_path (Path): Path to project directory. - url (str): URL or path to asset. - checksum (Optional[str]): Optional expected checksum of local file. - RETURNS (Optional[Path]): The path to the fetched asset or None if fetching - the asset failed. 
- """ - dest_path = (project_path / dest).resolve() - if dest_path.exists(): - # If there's already a file, check for checksum - if checksum: - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") - return - else: - # If there's not a checksum, make sure the file is a possibly valid size - if os.path.getsize(dest_path) == 0: - msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") - os.remove(dest_path) - # We might as well support the user here and create parent directories in - # case the asset dir isn't listed as a dir to create in the project.yml - if not dest_path.parent.exists(): - dest_path.parent.mkdir(parents=True) - with working_dir(project_path): - url = convert_asset_url(url) - try: - download_file(url, dest_path) - msg.good(f"Downloaded asset {dest}") - except requests.exceptions.RequestException as e: - if Path(url).exists() and Path(url).is_file(): - # If it's a local file, copy to destination - shutil.copy(url, str(dest_path)) - msg.good(f"Copied local asset {dest}") - else: - msg.fail(f"Download failed: {dest}", e) - if checksum and checksum != get_checksum(dest_path): - msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") - - -def convert_asset_url(url: str) -> str: - """Check and convert the asset URL if needed. - - url (str): The asset URL. - RETURNS (str): The converted URL. - """ - # If the asset URL is a regular GitHub URL it's likely a mistake - if ( - re.match(r"(http(s?)):\/\/github.com", url) - and "releases/download" not in url - and "/raw/" not in url - ): - converted = url.replace("github.com", "raw.githubusercontent.com") - converted = re.sub(r"/(tree|blob)/", "/", converted) - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. Converting the URL " - "to a raw URL.", - converted, - ) - return converted - return url diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py deleted file mode 100644 index 2ee27c92adb..00000000000 --- a/spacy/cli/project/clone.py +++ /dev/null @@ -1,124 +0,0 @@ -import re -import subprocess -from pathlib import Path -from typing import Optional - -from wasabi import msg - -from ... import about -from ...util import ensure_path -from .._util import ( - COMMAND, - PROJECT_FILE, - Arg, - Opt, - get_git_version, - git_checkout, - git_repo_branch_exists, - project_cli, -) - -DEFAULT_REPO = about.__projects__ -DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ -DEFAULT_BRANCHES = ["main", "master"] - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to clone"), - dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), - repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), - branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") - # fmt: on -): - """Clone a project template from a repository. Calls into "git" and will - only download the files from the given subdirectory. The GitHub repo - defaults to the official spaCy template repo, but can be customized - (including using a private repo). 
- - DOCS: https://spacy.io/api/cli#project-clone - """ - if dest is None: - dest = Path.cwd() / Path(name).parts[-1] - if repo == DEFAULT_REPO and branch is None: - branch = DEFAULT_PROJECTS_BRANCH - - if branch is None: - for default_branch in DEFAULT_BRANCHES: - if git_repo_branch_exists(repo, default_branch): - branch = default_branch - break - if branch is None: - default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) - msg.fail( - "No branch provided and attempted default " - f"branches {default_branches_msg} do not exist.", - exits=1, - ) - else: - if not git_repo_branch_exists(repo, branch): - msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) - assert isinstance(branch, str) - project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) - - -def project_clone( - name: str, - dest: Path, - *, - repo: str = about.__projects__, - branch: str = about.__projects_branch__, - sparse_checkout: bool = False, -) -> None: - """Clone a project template from a repository. - - name (str): Name of subdirectory to clone. - dest (Path): Destination path of cloned project. - repo (str): URL of Git repo containing project templates. - branch (str): The branch to clone from - """ - dest = ensure_path(dest) - check_clone(name, dest, repo) - project_dir = dest.resolve() - repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) - try: - git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) - except subprocess.CalledProcessError: - err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')" - msg.fail(err, exits=1) - msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir) - if not (project_dir / PROJECT_FILE).exists(): - msg.warn(f"No {PROJECT_FILE} found in directory") - else: - msg.good(f"Your project is now ready!") - print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") - - -def check_clone(name: str, dest: Path, repo: str) -> None: - """Check and validate that the destination path can be used to clone. Will - check that Git is available and that the destination path is suitable. - - name (str): Name of the directory to clone from the repo. - dest (Path): Local destination of cloned directory. - repo (str): URL of the repo to clone from. - """ - git_err = ( - f"Cloning spaCy project templates requires Git and the 'git' command. " - f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually." - ) - get_git_version(error=git_err) - if not dest: - msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) - if dest.exists(): - # Directory already exists (not allowed, clone needs to create it) - msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) - if not dest.parent.exists(): - # We're not creating parents, parent dir should exist - msg.fail( - f"Can't clone project, parent directory doesn't exist: {dest.parent}. 
" - f"Create the necessary folder(s) first before continuing.", - exits=1, - ) diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py deleted file mode 100644 index 80107d27acf..00000000000 --- a/spacy/cli/project/document.py +++ /dev/null @@ -1,115 +0,0 @@ -from pathlib import Path - -from wasabi import MarkdownRenderer, msg - -from ...util import working_dir -from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli - -DOCS_URL = "https://spacy.io" -INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the -project, as well as the available commands and workflows. For details, see the -[spaCy projects documentation]({DOCS_URL}/usage/projects).""" -INTRO_COMMANDS = f"""The following commands are defined by the project. They -can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run). -Commands are only re-run if their inputs have changed.""" -INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They -can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) -and will run the specified commands in order. Commands are only re-run if their -inputs have changed.""" -INTRO_ASSETS = f"""The following assets are defined by the project. They can -be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) -in the project directory.""" -# These markers are added to the Markdown and can be used to update the file in -# place if it already exists. Only the auto-generated part will be replaced. -MARKER_START = "" -MARKER_END = "" -# If this marker is used in an existing README, it's ignored and not replaced -MARKER_IGNORE = "" - - -@project_cli.command("document") -def project_document_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"), - no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji") - # fmt: on -): - """ - Auto-generate a README.md for a project. If the content is saved to a file, - hidden markers are added so you can add custom content before or after the - auto-generated section and only the auto-generated docs will be replaced - when you re-run the command. 
- - DOCS: https://spacy.io/api/cli#project-document - """ - project_document(project_dir, output_file, no_emoji=no_emoji) - - -def project_document( - project_dir: Path, output_file: Path, *, no_emoji: bool = False -) -> None: - is_stdout = str(output_file) == "-" - config = load_project_config(project_dir) - md = MarkdownRenderer(no_emoji=no_emoji) - md.add(MARKER_START) - title = config.get("title") - description = config.get("description") - md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) - if description: - md.add(description) - md.add(md.title(2, PROJECT_FILE, "📋")) - md.add(INTRO_PROJECT) - # Commands - cmds = config.get("commands", []) - data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds] - if data: - md.add(md.title(3, "Commands", "⏯")) - md.add(INTRO_COMMANDS) - md.add(md.table(data, ["Command", "Description"])) - # Workflows - wfs = config.get("workflows", {}).items() - data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] - if data: - md.add(md.title(3, "Workflows", "⏭")) - md.add(INTRO_WORKFLOWS) - md.add(md.table(data, ["Workflow", "Steps"])) - # Assets - assets = config.get("assets", []) - data = [] - for a in assets: - source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" - dest_path = a["dest"] - dest = md.code(dest_path) - if source == "Local": - # Only link assets if they're in the repo - with working_dir(project_dir) as p: - if (p / dest_path).exists(): - dest = md.link(dest, dest_path) - data.append((dest, source, a.get("description", ""))) - if data: - md.add(md.title(3, "Assets", "🗂")) - md.add(INTRO_ASSETS) - md.add(md.table(data, ["File", "Source", "Description"])) - md.add(MARKER_END) - # Output result - if is_stdout: - print(md.text) - else: - content = md.text - if output_file.exists(): - with output_file.open("r", encoding="utf8") as f: - existing = f.read() - if MARKER_IGNORE in existing: - msg.warn("Found ignore marker in existing file: skipping", output_file) - return - if MARKER_START in existing and MARKER_END in existing: - msg.info("Found existing file: only replacing auto-generated docs") - before = existing.split(MARKER_START)[0] - after = existing.split(MARKER_END)[1] - content = f"{before}{content}{after}" - else: - msg.warn("Replacing existing file") - with output_file.open("w", encoding="utf8") as f: - f.write(content) - msg.good("Saved project documentation", output_file) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py deleted file mode 100644 index 9ad55c43302..00000000000 --- a/spacy/cli/project/dvc.py +++ /dev/null @@ -1,220 +0,0 @@ -"""This module contains helpers and subcommands for integrating spaCy projects -with Data Version Controk (DVC). https://dvc.org""" -import subprocess -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional - -from wasabi import msg - -from ...util import ( - SimpleFrozenList, - join_command, - run_command, - split_command, - working_dir, -) -from .._util import ( - COMMAND, - NAME, - PROJECT_FILE, - Arg, - Opt, - get_hash, - load_project_config, - project_cli, -) - -DVC_CONFIG = "dvc.yaml" -DVC_DIR = ".dvc" -UPDATE_COMMAND = "dvc" -DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. 
If you've -# edited your {PROJECT_FILE}, you can regenerate this file by running: -# {COMMAND} project {UPDATE_COMMAND}""" - - -@project_cli.command(UPDATE_COMMAND) -def project_update_dvc_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."), - verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), - quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"), - force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), - # fmt: on -): - """Auto-generate Data Version Control (DVC) config. A DVC - project can only define one pipeline, so you need to specify one workflow - defined in the project.yml. If no workflow is specified, the first defined - workflow is used. The DVC config will only be updated if the project.yml - changed. - - DOCS: https://spacy.io/api/cli#project-dvc - """ - project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force) - - -def project_update_dvc( - project_dir: Path, - workflow: Optional[str] = None, - *, - verbose: bool = False, - quiet: bool = False, - force: bool = False, -) -> None: - """Update the auto-generated Data Version Control (DVC) config file. A DVC - project can only define one pipeline, so you need to specify one workflow - defined in the project.yml. Will only update the file if the checksum changed. - - project_dir (Path): The project directory. - workflow (Optional[str]): Optional name of workflow defined in project.yml. - If not set, the first workflow will be used. - verbose (bool): Print more info. - quiet (bool): Print less info. - force (bool): Force update DVC config. - """ - config = load_project_config(project_dir) - updated = update_dvc_config( - project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force - ) - help_msg = "To execute the workflow with DVC, run: dvc repro" - if updated: - msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg) - else: - msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg) - - -def update_dvc_config( - path: Path, - config: Dict[str, Any], - workflow: Optional[str] = None, - verbose: bool = False, - quiet: bool = False, - force: bool = False, -) -> bool: - """Re-run the DVC commands in dry mode and update dvc.yaml file in the - project directory. The file is auto-generated based on the config. The - first line of the auto-generated file specifies the hash of the config - dict, so if any of the config values change, the DVC config is regenerated. - - path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project.yml. - verbose (bool): Whether to print additional info (via DVC). - quiet (bool): Don't output anything (via DVC). - force (bool): Force update, even if hashes match. - RETURNS (bool): Whether the DVC config file was updated. 
- """ - ensure_dvc(path) - workflows = config.get("workflows", {}) - workflow_names = list(workflows.keys()) - check_workflows(workflow_names, workflow) - if not workflow: - workflow = workflow_names[0] - config_hash = get_hash(config) - path = path.resolve() - dvc_config_path = path / DVC_CONFIG - if dvc_config_path.exists(): - # Check if the file was generated using the current config, if not, redo - with dvc_config_path.open("r", encoding="utf8") as f: - ref_hash = f.readline().strip().replace("# ", "") - if ref_hash == config_hash and not force: - return False # Nothing has changed in project.yml, don't need to update - dvc_config_path.unlink() - dvc_commands = [] - config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - - # some flags that apply to every command - flags = [] - if verbose: - flags.append("--verbose") - if quiet: - flags.append("--quiet") - - for name in workflows[workflow]: - command = config_commands[name] - deps = command.get("deps", []) - outputs = command.get("outputs", []) - outputs_no_cache = command.get("outputs_no_cache", []) - if not deps and not outputs and not outputs_no_cache: - continue - # Default to the working dir as the project path since dvc.yaml is auto-generated - # and we don't want arbitrary paths in there - project_cmd = ["python", "-m", NAME, "project", "run", name] - deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] - outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] - outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - - dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"] - if command.get("no_skip"): - dvc_cmd.append("--always-changed") - full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - dvc_commands.append(join_command(full_cmd)) - - if not dvc_commands: - # If we don't check for this, then there will be an error when reading the - # config, since DVC wouldn't create it. - msg.fail( - "No usable commands for DVC found. This can happen if none of your " - "commands have dependencies or outputs.", - exits=1, - ) - - with working_dir(path): - for c in dvc_commands: - dvc_command = "dvc " + c - run_command(dvc_command) - with dvc_config_path.open("r+", encoding="utf8") as f: - content = f.read() - f.seek(0, 0) - f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") - return True - - -def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: - """Validate workflows provided in project.yml and check that a given - workflow can be used to generate a DVC config. - - workflows (List[str]): Names of the available workflows. - workflow (Optional[str]): The name of the workflow to convert. - """ - if not workflows: - msg.fail( - f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, " - f"define at least one list of commands.", - exits=1, - ) - if workflow is not None and workflow not in workflows: - msg.fail( - f"Workflow '{workflow}' not defined in {PROJECT_FILE}. " - f"Available workflows: {', '.join(workflows)}", - exits=1, - ) - if not workflow: - msg.warn( - f"No workflow specified for DVC pipeline. Using the first workflow " - f"defined in {PROJECT_FILE}: '{workflows[0]}'" - ) - - -def ensure_dvc(project_dir: Path) -> None: - """Ensure that the "dvc" command is available and that the current project - directory is an initialized DVC project. 
- """ - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "To use spaCy projects with DVC (Data Version Control), DVC needs " - "to be installed and the 'dvc' command needs to be available", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - if not (project_dir / ".dvc").exists(): - msg.fail( - "Project not initialized as a DVC project", - "To initialize a DVC project, you can run 'dvc init' in the project " - "directory. For more details, see the documentation: " - "https://dvc.org/doc/command-reference/init", - exits=1, - ) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py deleted file mode 100644 index e9be74df7f4..00000000000 --- a/spacy/cli/project/pull.py +++ /dev/null @@ -1,67 +0,0 @@ -from pathlib import Path - -from wasabi import msg - -from .._util import Arg, load_project_config, logger, project_cli -from .remote_storage import RemoteStorage, get_command_hash -from .run import update_lockfile - - -@project_cli.command("pull") -def project_pull_cli( - # fmt: off - remote: str = Arg("default", help="Name or path of remote storage"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Retrieve available precomputed outputs from a remote storage. - You can alias remotes in your project.yml by mapping them to storage paths. - A storage can be anything that the smart-open library can upload to, e.g. - AWS, Google Cloud Storage, SSH, local directories etc. - - DOCS: https://spacy.io/api/cli#project-pull - """ - for url, output_path in project_pull(project_dir, remote): - if url is not None: - msg.good(f"Pulled {output_path} from {url}") - - -def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): - # TODO: We don't have tests for this :(. It would take a bit of mockery to - # set up. I guess see if it breaks first? - config = load_project_config(project_dir) - if remote in config.get("remotes", {}): - remote = config["remotes"][remote] - storage = RemoteStorage(project_dir, remote) - commands = list(config.get("commands", [])) - # We use a while loop here because we don't know how the commands - # will be ordered. A command might need dependencies from one that's later - # in the list. - while commands: - for i, cmd in enumerate(list(commands)): - logger.debug("CMD: %s.", cmd["name"]) - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if all(dep.exists() for dep in deps): - cmd_hash = get_command_hash("", "", deps, cmd["script"]) - for output_path in cmd.get("outputs", []): - url = storage.pull(output_path, command_hash=cmd_hash) - logger.debug( - "URL: %s for %s with command hash %s", - url, - output_path, - cmd_hash, - ) - yield url, output_path - - out_locs = [project_dir / out for out in cmd.get("outputs", [])] - if all(loc.exists() for loc in out_locs): - update_lockfile(project_dir, cmd) - # We remove the command from the list here, and break, so that - # we iterate over the loop again. - commands.pop(i) - break - else: - logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) - else: - # If we didn't break the for loop, break the while loop. 
- break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py deleted file mode 100644 index a7915e54741..00000000000 --- a/spacy/cli/project/push.py +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path - -from wasabi import msg - -from .._util import Arg, load_project_config, logger, project_cli -from .remote_storage import RemoteStorage, get_command_hash, get_content_hash - - -@project_cli.command("push") -def project_push_cli( - # fmt: off - remote: str = Arg("default", help="Name or path of remote storage"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Persist outputs to a remote storage. You can alias remotes in your - project.yml by mapping them to storage paths. A storage can be anything that - the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH, - local directories etc. - - DOCS: https://spacy.io/api/cli#project-push - """ - for output_path, url in project_push(project_dir, remote): - if url is None: - msg.info(f"Skipping {output_path}") - else: - msg.good(f"Pushed {output_path} to {url}") - - -def project_push(project_dir: Path, remote: str): - """Persist outputs to a remote storage. You can alias remotes in your project.yml - by mapping them to storage paths. A storage can be anything that the smart-open - library can upload to, e.g. gcs, aws, ssh, local directories etc - """ - config = load_project_config(project_dir) - if remote in config.get("remotes", {}): - remote = config["remotes"][remote] - storage = RemoteStorage(project_dir, remote) - for cmd in config.get("commands", []): - logger.debug("CMD: %s", cmd["name"]) - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if any(not dep.exists() for dep in deps): - logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) - continue - cmd_hash = get_command_hash( - "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] - ) - logger.debug("CMD_HASH: %s", cmd_hash) - for output_path in cmd.get("outputs", []): - output_loc = project_dir / output_path - if output_loc.exists() and _is_not_empty_dir(output_loc): - url = storage.push( - output_path, - command_hash=cmd_hash, - content_hash=get_content_hash(output_loc), - ) - logger.debug( - "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash - ) - yield output_path, url - - -def _is_not_empty_dir(loc: Path): - if not loc.is_dir(): - return True - elif any(_is_not_empty_dir(child) for child in loc.iterdir()): - return True - else: - return False diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py deleted file mode 100644 index 84235a90d39..00000000000 --- a/spacy/cli/project/remote_storage.py +++ /dev/null @@ -1,212 +0,0 @@ -import hashlib -import os -import site -import tarfile -import urllib.parse -from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Optional - -from wasabi import msg - -from ... import about -from ...errors import Errors -from ...git_info import GIT_VERSION -from ...util import ENV_VARS, check_bool_env_var, get_minor_version -from .._util import ( - download_file, - ensure_pathy, - get_checksum, - get_hash, - make_tempdir, - upload_file, -) - -if TYPE_CHECKING: - from pathy import FluidPath # noqa: F401 - - -class RemoteStorage: - """Push and pull outputs to and from a remote file storage. - - Remotes can be anything that `smart-open` can support: AWS, GCS, file system, - ssh, etc. 
- """ - - def __init__(self, project_root: Path, url: str, *, compression="gz"): - self.root = project_root - self.url = ensure_pathy(url) - self.compression = compression - - def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": - """Compress a file or directory within a project and upload it to a remote - storage. If an object exists at the full URL, nothing is done. - - Within the remote storage, files are addressed by their project path - (url encoded) and two user-supplied hashes, representing their creation - context and their file contents. If the URL already exists, the data is - not uploaded. Paths are archived and compressed prior to upload. - """ - loc = self.root / path - if not loc.exists(): - raise IOError(f"Cannot push {loc}: does not exist.") - url = self.make_url(path, command_hash, content_hash) - if url.exists(): - return url - tmp: Path - with make_tempdir() as tmp: - tar_loc = tmp / self.encode_name(str(path)) - mode_string = f"w:{self.compression}" if self.compression else "w" - with tarfile.open(tar_loc, mode=mode_string) as tar_file: - tar_file.add(str(loc), arcname=str(path)) - upload_file(tar_loc, url) - return url - - def pull( - self, - path: Path, - *, - command_hash: Optional[str] = None, - content_hash: Optional[str] = None, - ) -> Optional["FluidPath"]: - """Retrieve a file from the remote cache. If the file already exists, - nothing is done. - - If the command_hash and/or content_hash are specified, only matching - results are returned. If no results are available, an error is raised. - """ - dest = self.root / path - if dest.exists(): - return None - url = self.find(path, command_hash=command_hash, content_hash=content_hash) - if url is None: - return url - else: - # Make sure the destination exists - if not dest.parent.exists(): - dest.parent.mkdir(parents=True) - tmp: Path - with make_tempdir() as tmp: - tar_loc = tmp / url.parts[-1] - download_file(url, tar_loc) - mode_string = f"r:{self.compression}" if self.compression else "r" - with tarfile.open(tar_loc, mode=mode_string) as tar_file: - # This requires that the path is added correctly, relative - # to root. This is how we set things up in push() - - # Disallow paths outside the current directory for the tar - # file (CVE-2007-4559, directory traversal vulnerability) - def is_within_directory(directory, target): - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - prefix = os.path.commonprefix([abs_directory, abs_target]) - return prefix == abs_directory - - def safe_extract(tar, path): - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise ValueError(Errors.E852) - tar.extractall(path) - - safe_extract(tar_file, self.root) - return url - - def find( - self, - path: Path, - *, - command_hash: Optional[str] = None, - content_hash: Optional[str] = None, - ) -> Optional["FluidPath"]: - """Find the best matching version of a file within the storage, - or `None` if no match can be found. If both the creation and content hash - are specified, only exact matches will be returned. Otherwise, the most - recent matching file is preferred. 
- """ - name = self.encode_name(str(path)) - urls = [] - if command_hash is not None and content_hash is not None: - url = self.url / name / command_hash / content_hash - urls = [url] if url.exists() else [] - elif command_hash is not None: - if (self.url / name / command_hash).exists(): - urls = list((self.url / name / command_hash).iterdir()) - else: - if (self.url / name).exists(): - for sub_dir in (self.url / name).iterdir(): - urls.extend(sub_dir.iterdir()) - if content_hash is not None: - urls = [url for url in urls if url.parts[-1] == content_hash] - if len(urls) >= 2: - try: - urls.sort(key=lambda x: x.stat().last_modified) # type: ignore - except Exception: - msg.warn( - "Unable to sort remote files by last modified. The file(s) " - "pulled from the cache may not be the most recent." - ) - return urls[-1] if urls else None - - def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": - """Construct a URL from a subpath, a creation hash and a content hash.""" - return self.url / self.encode_name(str(path)) / command_hash / content_hash - - def encode_name(self, name: str) -> str: - """Encode a subpath into a URL-safe name.""" - return urllib.parse.quote_plus(name) - - -def get_content_hash(loc: Path) -> str: - return get_checksum(loc) - - -def get_command_hash( - site_hash: str, env_hash: str, deps: List[Path], cmd: List[str] -) -> str: - """Create a hash representing the execution of a command. This includes the - currently installed packages, whatever environment variables have been marked - as relevant, and the command. - """ - if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION): - spacy_v = GIT_VERSION - else: - spacy_v = str(get_minor_version(about.__version__) or "") - dep_checksums = [get_checksum(dep) for dep in sorted(deps)] - hashes = [spacy_v, site_hash, env_hash] + dep_checksums - hashes.extend(cmd) - creation_bytes = "".join(hashes).encode("utf8") - return hashlib.md5(creation_bytes).hexdigest() - - -def get_site_hash(): - """Hash the current Python environment's site-packages contents, including - the name and version of the libraries. The list we're hashing is what - `pip freeze` would output. - """ - site_dirs = site.getsitepackages() - if site.ENABLE_USER_SITE: - site_dirs.extend(site.getusersitepackages()) - packages = set() - for site_dir in site_dirs: - site_dir = Path(site_dir) - for subpath in site_dir.iterdir(): - if subpath.parts[-1].endswith("dist-info"): - packages.add(subpath.parts[-1].replace(".dist-info", "")) - package_bytes = "".join(sorted(packages)).encode("utf8") - return hashlib.md5sum(package_bytes).hexdigest() - - -def get_env_hash(env: Dict[str, str]) -> str: - """Construct a hash of the environment variables that will be passed into - the commands. - - Values in the env dict may be references to the current os.environ, using - the syntax $ENV_VAR to mean os.environ[ENV_VAR] - """ - env_vars = {} - for key, value in env.items(): - if value.startswith("$"): - env_vars[key] = os.environ.get(value[1:], "") - else: - env_vars[key] = value - return get_hash(env_vars) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py deleted file mode 100644 index 43972a2026a..00000000000 --- a/spacy/cli/project/run.py +++ /dev/null @@ -1,379 +0,0 @@ -import os.path -import sys -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple - -import srsly -import typer -from wasabi import msg -from wasabi.util import locale_escape - -from ... 
import about -from ...git_info import GIT_VERSION -from ...util import ( - ENV_VARS, - SimpleFrozenDict, - SimpleFrozenList, - check_bool_env_var, - is_cwd, - is_minor_version_match, - join_command, - run_command, - split_command, - working_dir, -) -from .._util import ( - COMMAND, - PROJECT_FILE, - PROJECT_LOCK, - Arg, - Opt, - get_checksum, - get_hash, - load_project_config, - parse_config_overrides, - project_cli, -) - - -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} -) -def project_run_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), - dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run a named command or workflow defined in the project.yml. If a workflow - name is specified, all commands in the workflow are run, in order. If - commands define dependencies and/or outputs, they will only be re-run if - state has changed. - - DOCS: https://spacy.io/api/cli#project-run - """ - if show_help or not subcommand: - print_run_help(project_dir, subcommand) - else: - overrides = parse_config_overrides(ctx.args) - project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry) - - -def project_run( - project_dir: Path, - subcommand: str, - *, - overrides: Dict[str, Any] = SimpleFrozenDict(), - force: bool = False, - dry: bool = False, - capture: bool = False, - skip_requirements_check: bool = False, -) -> None: - """Run a named script defined in the project.yml. If the script is part - of the default pipeline (defined in the "run" section), DVC is used to - execute the command, so it can determine whether to rerun it. It then - calls into "exec" to execute it. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - overrides (Dict[str, Any]): Optional config overrides. - force (bool): Force re-running, even if nothing changed. - dry (bool): Perform a dry run and don't execute commands. - capture (bool): Whether to capture the output and errors of individual commands. - If False, the stdout and stderr will not be redirected, and if there's an error, - sys.exit will be called with the return code. You should use capture=False - when you want to turn over execution to the command, and capture=True - when you want to run the command more like a function. - skip_requirements_check (bool): Whether to skip the requirements check. 
- """ - config = load_project_config(project_dir, overrides=overrides) - commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - workflows = config.get("workflows", {}) - validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) - - req_path = project_dir / "requirements.txt" - if not skip_requirements_check: - if config.get("check_requirements", True) and os.path.exists(req_path): - with req_path.open() as requirements_file: - _check_requirements([req.strip() for req in requirements_file]) - - if subcommand in workflows: - msg.info(f"Running workflow '{subcommand}'") - for cmd in workflows[subcommand]: - project_run( - project_dir, - cmd, - overrides=overrides, - force=force, - dry=dry, - capture=capture, - skip_requirements_check=True, - ) - else: - cmd = commands[subcommand] - for dep in cmd.get("deps", []): - if not (project_dir / dep).exists(): - err = f"Missing dependency specified by command '{subcommand}': {dep}" - err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" - err_exits = 1 if not dry else None - msg.fail(err, err_help, exits=err_exits) - check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) - with working_dir(project_dir) as current_dir: - msg.divider(subcommand) - rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit) - if not rerun and not force: - msg.info(f"Skipping '{cmd['name']}': nothing changed") - else: - run_commands(cmd["script"], dry=dry, capture=capture) - if not dry: - update_lockfile(current_dir, cmd) - - -def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project.yml. - - project_dir (Path): The project directory. - subcommand (Optional[str]): The subcommand or None. If a subcommand is - provided, the subcommand help is shown. Otherwise, the top-level help - and a list of available commands is printed. - """ - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - commands = {cmd["name"]: cmd for cmd in config_commands} - workflows = config.get("workflows", {}) - project_loc = "" if is_cwd(project_dir) else project_dir - if subcommand: - validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) - print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") - if subcommand in commands: - help_text = commands[subcommand].get("help") - if help_text: - print(f"\n{help_text}\n") - elif subcommand in workflows: - steps = workflows[subcommand] - print(f"\nWorkflow consisting of {len(steps)} commands:") - steps_data = [ - (f"{i + 1}. 
{step}", commands[step].get("help", "")) - for i, step in enumerate(steps) - ] - msg.table(steps_data) - help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help" - print(f"For command details, run: {help_cmd}") - else: - print("") - title = config.get("title") - if title: - print(f"{locale_escape(title)}\n") - if config_commands: - print(f"Available commands in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - if workflows: - print(f"Available workflows in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}") - msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()]) - - -def run_commands( - commands: Iterable[str] = SimpleFrozenList(), - silent: bool = False, - dry: bool = False, - capture: bool = False, -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - silent (bool): Don't print the commands. - dry (bool): Perform a dry run and don't execut anything. - capture (bool): Whether to capture the output and errors of individual commands. - If False, the stdout and stderr will not be redirected, and if there's an error, - sys.exit will be called with the return code. You should use capture=False - when you want to turn over execution to the command, and capture=True - when you want to run the command more like a function. - """ - for c in commands: - command = split_command(c) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. - if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {join_command(command)}") - if not dry: - run_command(command, capture=capture) - - -def validate_subcommand( - commands: Sequence[str], workflows: Sequence[str], subcommand: str -) -> None: - """Check that a subcommand is valid and defined. Raises an error otherwise. - - commands (Sequence[str]): The available commands. - subcommand (str): The subcommand. - """ - if not commands and not workflows: - msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) - if subcommand not in commands and subcommand not in workflows: - help_msg = [] - if subcommand in ["assets", "asset"]: - help_msg.append("Did you mean to run: python -m spacy project assets?") - if commands: - help_msg.append(f"Available commands: {', '.join(commands)}") - if workflows: - help_msg.append(f"Available workflows: {', '.join(workflows)}") - msg.fail( - f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", - ". ".join(help_msg), - exits=1, - ) - - -def check_rerun( - project_dir: Path, - command: Dict[str, Any], - *, - check_spacy_version: bool = True, - check_spacy_commit: bool = False, -) -> bool: - """Check if a command should be rerun because its settings or inputs/outputs - changed. - - project_dir (Path): The current project directory. 
- command (Dict[str, Any]): The command, as defined in the project.yml. - strict_version (bool): - RETURNS (bool): Whether to re-run the command. - """ - # Always rerun if no-skip is set - if command.get("no_skip", False): - return True - lock_path = project_dir / PROJECT_LOCK - if not lock_path.exists(): # We don't have a lockfile, run command - return True - data = srsly.read_yaml(lock_path) - if command["name"] not in data: # We don't have info about this command - return True - entry = data[command["name"]] - # Always run commands with no outputs (otherwise they'd always be skipped) - if not entry.get("outs", []): - return True - # Always rerun if spaCy version or commit hash changed - spacy_v = entry.get("spacy_version") - commit = entry.get("spacy_git_version") - if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): - info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" - msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") - return True - if check_spacy_commit and commit != GIT_VERSION: - info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" - msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") - return True - # If the entry in the lockfile matches the lockfile entry that would be - # generated from the current command, we don't rerun because it means that - # all inputs/outputs, hashes and scripts are the same and nothing changed - lock_entry = get_lock_entry(project_dir, command) - exclude = ["spacy_version", "spacy_git_version"] - return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) - - -def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: - """Update the lockfile after running a command. Will create a lockfile if - it doesn't yet exist and will add an entry for the current command, its - script and dependencies/outputs. - - project_dir (Path): The current project directory. - command (Dict[str, Any]): The command, as defined in the project.yml. - """ - lock_path = project_dir / PROJECT_LOCK - if not lock_path.exists(): - srsly.write_yaml(lock_path, {}) - data = {} - else: - data = srsly.read_yaml(lock_path) - data[command["name"]] = get_lock_entry(project_dir, command) - srsly.write_yaml(lock_path, data) - - -def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]: - """Get a lockfile entry for a given command. An entry includes the command, - the script (command steps) and a list of dependencies and outputs with - their paths and file hashes, if available. The format is based on the - dvc.lock files, to keep things consistent. - - project_dir (Path): The current project directory. - command (Dict[str, Any]): The command, as defined in the project.yml. - RETURNS (Dict[str, Any]): The lockfile entry. - """ - deps = get_fileinfo(project_dir, command.get("deps", [])) - outs = get_fileinfo(project_dir, command.get("outputs", [])) - outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", [])) - return { - "cmd": f"{COMMAND} run {command['name']}", - "script": command["script"], - "deps": deps, - "outs": [*outs, *outs_nc], - "spacy_version": about.__version__, - "spacy_git_version": GIT_VERSION, - } - - -def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]: - """Generate the file information for a list of paths (dependencies, outputs). - Includes the file path and the file's checksum. - - project_dir (Path): The current project directory. 
- paths (List[str]): The file paths. - RETURNS (List[Dict[str, str]]): The lockfile entry for a file. - """ - data = [] - for path in paths: - file_path = project_dir / path - md5 = get_checksum(file_path) if file_path.exists() else None - data.append({"path": path, "md5": md5}) - return data - - -def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: - """Checks whether requirements are installed and free of version conflicts. - requirements (List[str]): List of requirements. - RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts - exist. - """ - import pkg_resources - - failed_pkgs_msgs: List[str] = [] - conflicting_pkgs_msgs: List[str] = [] - - for req in requirements: - try: - pkg_resources.require(req) - except pkg_resources.DistributionNotFound as dnf: - failed_pkgs_msgs.append(dnf.report()) - except pkg_resources.VersionConflict as vc: - conflicting_pkgs_msgs.append(vc.report()) - except Exception: - msg.warn( - f"Unable to check requirement: {req} " - "Checks are currently limited to requirement specifiers " - "(PEP 508)" - ) - - if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): - msg.warn( - title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up " - "correctly and you installed all requirements specified in your project's requirements.txt: " - ) - for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs: - msg.text(pgk_msg) - - return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0 diff --git a/spacy/errors.py b/spacy/errors.py index a2f8ca85c3b..225cb9c86ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -557,8 +557,6 @@ class Errors(metaclass=ErrorsWithCodes): "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") - E852 = ("The tar file pulled from the remote attempted an unsafe path " - "traversal.") E853 = ("Unsupported component factory name '{name}'. The character '.' is " "not permitted in factory names.") E854 = ("Unable to set doc.ents. 
Check that the 'ents_filter' does not " diff --git a/spacy/schemas.py b/spacy/schemas.py index 22f45372c23..22c25e99d04 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -465,66 +465,6 @@ class Config: "initialize": ConfigSchemaInit, } - -# Project config Schema - - -class ProjectConfigAssetGitItem(BaseModel): - # fmt: off - repo: StrictStr = Field(..., title="URL of Git repo to download from") - path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)") - branch: StrictStr = Field("master", title="Branch to clone from") - # fmt: on - - -class ProjectConfigAssetURL(BaseModel): - # fmt: off - dest: StrictStr = Field(..., title="Destination of downloaded asset") - url: Optional[StrictStr] = Field(None, title="URL of asset") - checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") - description: StrictStr = Field("", title="Description of asset") - # fmt: on - - -class ProjectConfigAssetGit(BaseModel): - # fmt: off - git: ProjectConfigAssetGitItem = Field(..., title="Git repo information") - checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") - description: Optional[StrictStr] = Field(None, title="Description of asset") - # fmt: on - - -class ProjectConfigCommand(BaseModel): - # fmt: off - name: StrictStr = Field(..., title="Name of command") - help: Optional[StrictStr] = Field(None, title="Command description") - script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") - deps: List[StrictStr] = Field([], title="File dependencies required by this command") - outputs: List[StrictStr] = Field([], title="Outputs produced by this command") - outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)") - no_skip: bool = Field(False, title="Never skip this command, even if nothing changed") - # fmt: on - - class Config: - title = "A single named command specified in a project config" - extra = "forbid" - - -class ProjectConfigSchema(BaseModel): - # fmt: off - vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") - env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names") - assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") - workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") - commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") - title: Optional[str] = Field(None, title="Project title") - spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with") - # fmt: on - - class Config: - title = "Schema for project configuration file" - - # Recommendations for init config workflows diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index f5a7aadb8bd..9b4f6851e47 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -11,21 +11,13 @@ from click import NoSuchOption from packaging.specifiers import SpecifierSet from thinc.api import Config, ConfigValidationError +from weasel.cli.remote_storage import RemoteStorage +from weasel.cli.run import _check_requirements import spacy from spacy import about from spacy.cli import info -from spacy.cli._util import ( - download_file, - is_subpath_of, - load_project_config, - parse_config_overrides, - string_to_list, - 
substitute_project_variables, - upload_file, - validate_project_commands, - walk_directory, -) +from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply from spacy.cli.debug_data import ( _compile_gold, @@ -43,13 +35,11 @@ from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies -from spacy.cli.project.remote_storage import RemoteStorage -from spacy.cli.project.run import _check_requirements from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language -from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.schemas import RecommendationSchema, validate from spacy.tokens import Doc, DocBin from spacy.tokens.span import Span from spacy.training import Example, docs_to_json, offsets_to_biluo_tags @@ -134,25 +124,6 @@ def test_issue7055(): assert "model" in filled_cfg["components"]["ner"] -@pytest.mark.issue(11235) -def test_issue11235(): - """ - Test that the cli handles interpolation in the directory names correctly when loading project config. - """ - lang_var = "en" - variables = {"lang": lang_var} - commands = [{"name": "x", "script": ["hello ${vars.lang}"]}] - directories = ["cfg", "${vars.lang}_model"] - project = {"commands": commands, "vars": variables, "directories": directories} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - # Check that the directories are interpolated and created correctly - assert os.path.exists(d / "cfg") - assert os.path.exists(d / f"{lang_var}_model") - assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" - - @pytest.mark.issue(12566) @pytest.mark.parametrize( "factory,output_file", @@ -443,136 +414,6 @@ def test_cli_converters_conll_ner_to_docs(): assert ent.text in ["New York City", "London"] -def test_project_config_validation_full(): - config = { - "vars": {"some_var": 20}, - "directories": ["assets", "configs", "corpus", "scripts", "training"], - "assets": [ - { - "dest": "x", - "extra": True, - "url": "https://example.com", - "checksum": "63373dd656daa1fd3043ce166a59474c", - }, - { - "dest": "y", - "git": { - "repo": "https://github.com/example/repo", - "branch": "develop", - "path": "y", - }, - }, - { - "dest": "z", - "extra": False, - "url": "https://example.com", - "checksum": "63373dd656daa1fd3043ce166a59474c", - }, - ], - "commands": [ - { - "name": "train", - "help": "Train a model", - "script": ["python -m spacy train config.cfg -o training"], - "deps": ["config.cfg", "corpus/training.spcy"], - "outputs": ["training/model-best"], - }, - {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True}, - ], - "workflows": {"all": ["train", "test"], "train": ["train"]}, - } - errors = validate(ProjectConfigSchema, config) - assert not errors - - -@pytest.mark.parametrize( - "config", - [ - {"commands": [{"name": "a"}, {"name": "a"}]}, - {"commands": [{"name": "a"}], "workflows": {"a": []}}, - {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}}, - ], -) -def test_project_config_validation1(config): - with pytest.raises(SystemExit): - validate_project_commands(config) - - -@pytest.mark.parametrize( - "config,n_errors", - [ - ({"commands": {"a": []}}, 1), - ({"commands": [{"help": "..."}]}, 1), - ({"commands": [{"name": "a", "extra": "b"}]}, 1), 
- ({"commands": [{"extra": "b"}]}, 2), - ({"commands": [{"name": "a", "deps": [123]}]}, 1), - ], -) -def test_project_config_validation2(config, n_errors): - errors = validate(ProjectConfigSchema, config) - assert len(errors) == n_errors - - -@pytest.mark.parametrize( - "int_value", - [10, pytest.param("10", marks=pytest.mark.xfail)], -) -def test_project_config_interpolation(int_value): - variables = {"a": int_value, "b": {"c": "foo", "d": True}} - commands = [ - {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]}, - {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]}, - ] - project = {"commands": commands, "vars": variables} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert type(cfg) == dict - assert type(cfg["commands"]) == list - assert cfg["commands"][0]["script"][0] == "hello 10 foo" - assert cfg["commands"][1]["script"][0] == "foo true" - commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}] - project = {"commands": commands, "vars": variables} - with pytest.raises(ConfigValidationError): - substitute_project_variables(project) - - -@pytest.mark.parametrize( - "greeting", - [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)], -) -def test_project_config_interpolation_override(greeting): - variables = {"a": "world"} - commands = [ - {"name": "x", "script": ["hello ${vars.a}"]}, - ] - overrides = {"vars.a": greeting} - project = {"commands": commands, "vars": variables} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d, overrides=overrides) - assert type(cfg) == dict - assert type(cfg["commands"]) == list - assert cfg["commands"][0]["script"][0] == f"hello {greeting}" - - -def test_project_config_interpolation_env(): - variables = {"a": 10} - env_var = "SPACY_TEST_FOO" - env_vars = {"foo": env_var} - commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}] - project = {"commands": commands, "vars": variables, "env": env_vars} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert cfg["commands"][0]["script"][0] == "hello 10 " - os.environ[env_var] = "123" - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert cfg["commands"][0]["script"][0] == "hello 10 123" - - @pytest.mark.parametrize( "args,expected", [ @@ -784,21 +625,6 @@ def test_factory(nlp, name): get_third_party_dependencies(nlp.config) -@pytest.mark.parametrize( - "parent,child,expected", - [ - ("/tmp", "/tmp", True), - ("/tmp", "/", False), - ("/tmp", "/tmp/subdir", True), - ("/tmp", "/tmpdir", False), - ("/tmp", "/tmp/subdir/..", True), - ("/tmp", "/tmp/..", False), - ], -) -def test_is_subpath_of(parent, child, expected): - assert is_subpath_of(parent, child) == expected - - @pytest.mark.slow @pytest.mark.parametrize( "factory_name,pipe_name", @@ -1044,60 +870,6 @@ def test_applycli_user_data(): assert result[0]._.ext == val -def test_local_remote_storage(): - with make_tempdir() as d: - filename = "a.txt" - - content_hashes = ("aaaa", "cccc", "bbbb") - for i, content_hash in enumerate(content_hashes): - # make sure that each subsequent file has a later timestamp - if i > 0: - time.sleep(1) - content = f"{content_hash} content" - loc_file = d / "root" / filename - if not loc_file.parent.exists(): - loc_file.parent.mkdir(parents=True) - with loc_file.open(mode="w") as file_: - file_.write(content) - - # push first version to 
remote storage - remote = RemoteStorage(d / "root", str(d / "remote")) - remote.push(filename, "aaaa", content_hash) - - # retrieve with full hashes - loc_file.unlink() - remote.pull(filename, command_hash="aaaa", content_hash=content_hash) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with command hash - loc_file.unlink() - remote.pull(filename, command_hash="aaaa") - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with content hash - loc_file.unlink() - remote.pull(filename, content_hash=content_hash) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with no hashes - loc_file.unlink() - remote.pull(filename) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - -def test_local_remote_storage_pull_missing(): - # pulling from a non-existent remote pulls nothing gracefully - with make_tempdir() as d: - filename = "a.txt" - remote = RemoteStorage(d / "root", str(d / "remote")) - assert remote.pull(filename, command_hash="aaaa") is None - assert remote.pull(filename) is None - - def test_cli_find_threshold(capsys): def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -1208,63 +980,6 @@ def init_nlp( ) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -@pytest.mark.parametrize( - "reqs,output", - [ - [ - """ - spacy - - # comment - - thinc""", - (False, False), - ], - [ - """# comment - --some-flag - spacy""", - (False, False), - ], - [ - """# comment - --some-flag - spacy; python_version >= '3.6'""", - (False, False), - ], - [ - """# comment - spacyunknowndoesnotexist12345""", - (True, False), - ], - ], -) -def test_project_check_requirements(reqs, output): - import pkg_resources - - # excessive guard against unlikely package name - try: - pkg_resources.require("spacyunknowndoesnotexist12345") - except pkg_resources.DistributionNotFound: - assert output == _check_requirements([req.strip() for req in reqs.split("\n")]) - - -def test_upload_download_local_file(): - with make_tempdir() as d1, make_tempdir() as d2: - filename = "f.txt" - content = "content" - local_file = d1 / filename - remote_file = d2 / filename - with local_file.open(mode="w") as file_: - file_.write(content) - upload_file(local_file, remote_file) - local_file.unlink() - download_file(remote_file, local_file) - with local_file.open(mode="r") as file_: - assert file_.read() == content - - def test_walk_directory(): with make_tempdir() as d: files = [ diff --git a/spacy/util.py b/spacy/util.py index 762699a9756..a2a033cbc0d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -101,7 +101,6 @@ class ENV_VARS: CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" - PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION" class registry(thinc.registry): @@ -974,23 +973,12 @@ def replace_model_node(model: Model, target: Model, replacement: Model) -> None: def split_command(command: str) -> List[str]: """Split a string command using shlex. Handles platform compatibility. - command (str) : The command to split RETURNS (List[str]): The split command. """ return shlex.split(command, posix=not is_windows) -def join_command(command: List[str]) -> str: - """Join a command using shlex. shlex.join is only available for Python 3.8+, - so we're using a workaround here. - - command (List[str]): The command to join. 
- RETURNS (str): The joined command - """ - return " ".join(shlex.quote(cmd) for cmd in command) - - def run_command( command: Union[str, List[str]], *, @@ -999,7 +987,6 @@ def run_command( ) -> subprocess.CompletedProcess: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. - command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. stdin (Optional[Any]): stdin to read from or None. @@ -1050,7 +1037,6 @@ def run_command( @contextmanager def working_dir(path: Union[str, Path]) -> Iterator[Path]: """Change current working directory and returns to previous on exit. - path (str / Path): The directory to navigate to. YIELDS (Path): The absolute path to the current working directory. This should be used if the block needs to perform actions within the working @@ -1069,7 +1055,6 @@ def working_dir(path: Union[str, Path]) -> Iterator[Path]: def make_tempdir() -> Generator[Path, None, None]: """Execute a block in a temporary directory and remove the directory and its contents at the end of the with block. - YIELDS (Path): The path of the temp directory. """ d = Path(tempfile.mkdtemp()) @@ -1087,15 +1072,6 @@ def force_remove(rmfunc, path, ex): warnings.warn(Warnings.W091.format(dir=d, msg=e)) -def is_cwd(path: Union[Path, str]) -> bool: - """Check whether a path is the current working directory. - - path (Union[Path, str]): The directory path. - RETURNS (bool): Whether the path is the current working directory. - """ - return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() - - def is_in_jupyter() -> bool: """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. From 4f8daa4f003785e23cf2612683f78476dac7baca Mon Sep 17 00:00:00 2001 From: Jacobo Myerston <43222279+jmyerston@users.noreply.github.com> Date: Thu, 20 Jul 2023 02:16:01 -0700 Subject: [PATCH 05/39] Add Left and Right Pointing Angle Brackets as punctuation to ancient Greek (#12829) * Update universe.json * Update universe.json add some missing commas in the greCy's description. * Update punctuation.py Add mathematical left and right angle brackets as punctuation for ancient Greek for better tokenization. --- spacy/lang/grc/punctuation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py index 8e9fc8bf29c..59037617d38 100644 --- a/spacy/lang/grc/punctuation.py +++ b/spacy/lang/grc/punctuation.py @@ -15,6 +15,7 @@ [ "†", "⸏", + "〈", ] + LIST_PUNCT + LIST_ELLIPSES @@ -31,6 +32,7 @@ + [ "†", "⸎", + "〉", r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]", ] ) From 5888afa8840fc73afe09d45745c69de0b7828328 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 24 Jul 2023 10:32:56 +0200 Subject: [PATCH 06/39] Update numpy build constraints for numpy 1.25 (#12839) * Update numpy build constraints for numpy 1.25 Starting in numpy 1.25 (see https://github.com/numpy/numpy/releases/tag/v1.25.0), the numpy C API is backwards-compatible by default. For python 3.9+, we should be able to drop the specific numpy build requirements and use `numpy>=1.25`, which is currently backwards-compatible to `numpy>=1.19`. In the future, the python <3.9 requirements could be dropped and the lower numpy pin could correspond to the oldest supported version for the current lower python pin. 
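To make that intended end state concrete, a rough sketch of how the pins could look once the python <3.9 requirements are dropped (illustrative values only, not part of this patch; the file names and versions follow the changes below):

```
# build-constraints.txt: build-time pin only; compiling the C extensions
# against numpy>=1.25 produces binaries that are backwards-compatible with
# older numpy releases by default
numpy>=1.25.0

# requirements.txt / setup.cfg: run-time pin; 1.19 is currently the oldest
# release that binaries built against 1.25 remain compatible with
numpy>=1.19.0
```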
* Turn off fail-fast * Revert "Turn off fail-fast" This reverts commit 4306f516bc4a6b3437b5393ff1b6b6ae54957d2d. * Update for python 3.6 * Fix typo --- build-constraints.txt | 5 +---- pyproject.toml | 3 ++- requirements.txt | 3 ++- setup.cfg | 8 +++++++- spacy/tests/package/test_requirements.py | 3 ++- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/build-constraints.txt b/build-constraints.txt index c1e82f1b074..5540d634d69 100644 --- a/build-constraints.txt +++ b/build-constraints.txt @@ -3,7 +3,4 @@ numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' -numpy==1.19.3; python_version=='3.9' -numpy==1.21.3; python_version=='3.10' -numpy==1.23.2; python_version=='3.11' -numpy; python_version>='3.12' +numpy>=1.25.0; python_version>='3.9' diff --git a/pyproject.toml b/pyproject.toml index dcb5cf10d18..c611c6c1c93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,8 @@ requires = [ "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", "thinc>=8.1.8,<8.2.0", - "numpy>=1.15.0", + "numpy>=1.15.0; python_version < '3.9'", + "numpy>=1.25.0; python_version >= '3.9'", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 2123ae976d3..9e787a22312 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,8 @@ pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 weasel>=0.1.0,<0.2.0 # Third party dependencies -numpy>=1.15.0 +numpy>=1.15.0; python_version < "3.9" +numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 diff --git a/setup.cfg b/setup.cfg index 048bb37197e..d94c9c73bee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,8 +32,13 @@ project_urls = zip_safe = false include_package_data = true python_requires = >=3.6 +# NOTE: This section is superseded by pyproject.toml and will be removed in +# spaCy v4 setup_requires = cython>=0.25,<3.0 + # The newest supported pip for python 3.6 has bugs related to markers in + # this section, so this does not contain the same constraints as + # pyproject.toml numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 @@ -57,7 +62,8 @@ install_requires = pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 - numpy>=1.15.0 + numpy>=1.15.0; python_version < "3.9" + numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 jinja2 diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index fab1e8218e6..ff07c5b454a 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -4,8 +4,8 @@ def test_build_dependencies(): # Check that library requirements are pinned exactly the same across different setup files. 
- # TODO: correct checks for numpy rather than ignoring libs_ignore_requirements = [ + "numpy", "pytest", "pytest-timeout", "mock", @@ -23,6 +23,7 @@ def test_build_dependencies(): ] # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ + "numpy", "fugashi", "natto-py", "pythainlp", From 9ffa5d8a1582bfb9720585792fe5294c36d55370 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 28 Jul 2023 15:48:36 +0200 Subject: [PATCH 07/39] Remove ray extra (#12870) --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index d94c9c73bee..116e40f2cc9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -82,8 +82,6 @@ lookups = spacy_lookups_data>=1.0.3,<1.1.0 transformers = spacy_transformers>=1.1.2,<1.3.0 -ray = - spacy_ray>=0.1.0,<1.0.0 cuda = cupy>=5.0.0b4,<13.0.0 cuda80 = From 0fe43f40f1390092dc265c9f2b2cef58ae06cc58 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 15:46:08 +0200 Subject: [PATCH 08/39] Support registered vectors (#12492) * Support registered vectors * Format * Auto-fill [nlp] on load from config and from bytes/disk * Only auto-fill [nlp] * Undo all changes to Language.from_disk * Expand BaseVectors These methods are needed in various places for training and vector similarity. * isort * More linting * Only fill [nlp.vectors] * Update spacy/vocab.pyx Co-authored-by: Sofie Van Landeghem * Revert changes to test related to auto-filling [nlp] * Add vectors registry * Rephrase error about vocab methods for vectors * Switch to dummy implementation for BaseVectors.to_ops * Add initial draft of docs * Remove example from BaseVectors docs * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem * Update website/docs/api/basevectors.mdx Co-authored-by: Sofie Van Landeghem * Fix type and lint bpemb example * Update website/docs/api/basevectors.mdx --------- Co-authored-by: Sofie Van Landeghem --- spacy/default_config.cfg | 3 + spacy/errors.py | 2 + spacy/language.py | 18 +- spacy/ml/staticvectors.py | 11 +- spacy/schemas.py | 1 + spacy/util.py | 1 + spacy/vectors.pyx | 75 ++++++++- spacy/vocab.pyx | 18 +- website/docs/api/basevectors.mdx | 143 ++++++++++++++++ website/docs/api/vectors.mdx | 9 +- .../docs/usage/embeddings-transformers.mdx | 159 ++++++++++++++++++ website/meta/sidebars.json | 1 + 12 files changed, 425 insertions(+), 16 deletions(-) create mode 100644 website/docs/api/basevectors.mdx diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 694fb732f43..b005eef4023 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -26,6 +26,9 @@ batch_size = 1000 [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" +[nlp.vectors] +@vectors = "spacy.Vectors.v1" + # The pipeline components and their models [components] diff --git a/spacy/errors.py b/spacy/errors.py index 225cb9c86ae..14ec669a308 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -553,6 +553,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E849 = ("The vocab only supports {method} for vectors of type " + "spacy.vectors.Vectors, not {vectors_type}.") E850 = ("The PretrainVectors objective currently only supports default or " "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " diff --git a/spacy/language.py b/spacy/language.py index b144b2c324a..26152b90a48 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -65,6 +65,7 @@ registry, 
warn_if_jupyter_cupy, ) +from .vectors import BaseVectors from .vocab import Vocab, create_vocab PipeCallable = Callable[[Doc], Doc] @@ -158,6 +159,7 @@ def __init__( max_length: int = 10**6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, + create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None, batch_size: int = 1000, **kwargs, ) -> None: @@ -198,6 +200,10 @@ def __init__( if vocab is True: vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) + if not create_vectors: + vectors_cfg = {"vectors": self._config["nlp"]["vectors"]} + create_vectors = registry.resolve(vectors_cfg)["vectors"] + vocab.vectors = create_vectors(vocab) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1765,6 +1771,10 @@ def from_config( ).merge(config) if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) + # fill in [nlp.vectors] if not present (as a narrower alternative to + # auto-filling [nlp] from the default config) + if "vectors" not in config["nlp"]: + config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"} config_lang = config["nlp"].get("lang") if config_lang is not None and config_lang != cls.lang: raise ValueError( @@ -1796,6 +1806,7 @@ def from_config( filled["nlp"], validate=validate, schema=ConfigSchemaNlp ) create_tokenizer = resolved_nlp["tokenizer"] + create_vectors = resolved_nlp["vectors"] before_creation = resolved_nlp["before_creation"] after_creation = resolved_nlp["after_creation"] after_pipeline_creation = resolved_nlp["after_pipeline_creation"] @@ -1816,7 +1827,12 @@ def from_config( # inside stuff like the spacy train function. If we loaded them here, # then we would load them twice at runtime: once when we make from config, # and then again when we load from disk. 
- nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta) + nlp = lang_cls( + vocab=vocab, + create_tokenizer=create_tokenizer, + create_vectors=create_vectors, + meta=meta, + ) if after_creation is not None: nlp = after_creation(nlp) if not isinstance(nlp, cls): diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index b75240c5d1e..1a1b0a0fffd 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -9,7 +9,7 @@ from ..attrs import ORTH from ..errors import Errors, Warnings from ..tokens import Doc -from ..vectors import Mode +from ..vectors import Mode, Vectors from ..vocab import Vocab @@ -48,11 +48,14 @@ def forward( key_attr: int = getattr(vocab.vectors, "attr", ORTH) keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) - if vocab.vectors.mode == Mode.default: + if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: V = model.ops.asarray(vocab.vectors.data) rows = vocab.vectors.find(keys=keys) V = model.ops.as_contig(V[rows]) - elif vocab.vectors.mode == Mode.floret: + elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret: + V = vocab.vectors.get_batch(keys) + V = model.ops.as_contig(V) + elif hasattr(vocab.vectors, "get_batch"): V = vocab.vectors.get_batch(keys) V = model.ops.as_contig(V) else: @@ -61,7 +64,7 @@ def forward( vectors_data = model.ops.gemm(V, W, trans2=True) except ValueError: raise RuntimeError(Errors.E896) - if vocab.vectors.mode == Mode.default: + if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: # Convert negative indices to 0-vectors # TODO: more options for UNK tokens vectors_data[rows < 0] = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 22c25e99d04..3404687e1d1 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -397,6 +397,7 @@ class ConfigSchemaNlp(BaseModel): after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") batch_size: Optional[int] = Field(..., title="Default batch size") + vectors: Callable = Field(..., title="Vectors implementation") # fmt: on class Config: diff --git a/spacy/util.py b/spacy/util.py index a2a033cbc0d..1689ac827e1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -118,6 +118,7 @@ class registry(thinc.registry): augmenters = catalogue.create("spacy", "augmenters", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True) scorers = catalogue.create("spacy", "scorers", entry_points=True) + vectors = catalogue.create("spacy", "vectors", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. 
The "true" factories are added via the diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index a88f380f9f4..2817bcad42a 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,3 +1,6 @@ +# cython: infer_types=True, profile=True, binding=True +from typing import Callable + from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uint64_t from libcpp.set cimport set as cppset @@ -5,7 +8,8 @@ from murmurhash.mrmr cimport hash128_x64 import warnings from enum import Enum -from typing import cast +from pathlib import Path +from typing import TYPE_CHECKING, Union, cast import numpy import srsly @@ -21,6 +25,9 @@ from .attrs import IDS from .errors import Errors, Warnings from .strings import get_string_id +if TYPE_CHECKING: + from .vocab import Vocab # noqa: F401 # no-cython-lint + def unpickle_vectors(bytes_data): return Vectors().from_bytes(bytes_data) @@ -35,7 +42,71 @@ class Mode(str, Enum): return list(cls.__members__.keys()) -cdef class Vectors: +cdef class BaseVectors: + def __init__(self, *, strings=None): + # Make sure abstract BaseVectors is not instantiated. + if self.__class__ == BaseVectors: + raise TypeError( + Errors.E1046.format(cls_name=self.__class__.__name__) + ) + + def __getitem__(self, key): + raise NotImplementedError + + def __contains__(self, key): + raise NotImplementedError + + def is_full(self): + raise NotImplementedError + + def get_batch(self, keys): + raise NotImplementedError + + @property + def shape(self): + raise NotImplementedError + + def __len__(self): + raise NotImplementedError + + @property + def vectors_length(self): + raise NotImplementedError + + @property + def size(self): + raise NotImplementedError + + def add(self, key, *, vector=None): + raise NotImplementedError + + def to_ops(self, ops: Ops): + pass + + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to + # allow serialization + def to_bytes(self, **kwargs): + return b"" + + def from_bytes(self, data: bytes, **kwargs): + return self + + def to_disk(self, path: Union[str, Path], **kwargs): + return None + + def from_disk(self, path: Union[str, Path], **kwargs): + return self + + +@util.registry.vectors("spacy.Vectors.v1") +def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]: + def vectors_factory(vocab: "Vocab") -> BaseVectors: + return Vectors(strings=vocab.strings) + + return vectors_factory + + +cdef class Vectors(BaseVectors): """Store, save and load word vectors. Vectors data is kept in the vectors.data attribute, which should be an diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d1edc853395..48e8fcb9087 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -94,8 +94,9 @@ cdef class Vocab: return self._vectors def __set__(self, vectors): - for s in vectors.strings: - self.strings.add(s) + if hasattr(vectors, "strings"): + for s in vectors.strings: + self.strings.add(s) self._vectors = vectors self._vectors.strings = self.strings @@ -193,7 +194,7 @@ cdef class Vocab: lex = mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) lex.length = len(string) - if self.vectors is not None: + if self.vectors is not None and hasattr(self.vectors, "key2row"): lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK) else: lex.id = OOV_RANK @@ -289,12 +290,17 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.shape[1] + if hasattr(self.vectors, "shape"): + return self.vectors.shape[1] + else: + return -1 def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. 
Because all vectors must be the same width, you have to call this to change the size of the vectors. """ + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors))) if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: @@ -304,6 +310,8 @@ cdef class Vocab: self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def deduplicate_vectors(self): + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, @@ -357,6 +365,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx new file mode 100644 index 00000000000..993b9a33e96 --- /dev/null +++ b/website/docs/api/basevectors.mdx @@ -0,0 +1,143 @@ +--- +title: BaseVectors +teaser: Abstract class for word vectors +tag: class +source: spacy/vectors.pyx +version: 3.7 +--- + +`BaseVectors` is an abstract class to support the development of custom vectors +implementations. + +For use in training with [`StaticVectors`](/api/architectures#staticvectors), +`get_batch` must be implemented. For improved performance, use efficient +batching in `get_batch` and implement `to_ops` to copy the vector data to the +current device. See an example custom implementation for +[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors). + +## BaseVectors.\_\_init\_\_ {id="init",tag="method"} + +Create a new vector store. + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | + +## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"} + +Get a vector by key. If the key is not found in the table, a `KeyError` should +be raised. + +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| `key` | The key to get the vector for. ~~Union[int, str]~~ | +| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | + +## BaseVectors.\_\_len\_\_ {id="len",tag="method"} + +Return the number of vectors in the table. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | The number of vectors in the table. ~~int~~ | + +## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"} + +Check whether there is a vector entry for the given key. + +| Name | Description | +| ----------- | -------------------------------------------- | +| `key` | The key to check. ~~int~~ | +| **RETURNS** | Whether the key has a vector entry. ~~bool~~ | + +## BaseVectors.add {id="add",tag="method"} + +Add a key to the table, if possible. If no keys can be added, return `-1`. 
+ +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------- | +| `key` | The key to add. ~~Union[str, int]~~ | +| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~ | + +## BaseVectors.shape {id="shape",tag="property"} + +Get `(rows, dims)` tuples of number of rows and number of dimensions in the +vector table. + +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ | + +## BaseVectors.size {id="size",tag="property"} + +The vector size, i.e. `rows * dims`. + +| Name | Description | +| ----------- | ------------------------ | +| **RETURNS** | The vector size. ~~int~~ | + +## BaseVectors.is_full {id="is_full",tag="property"} + +Whether the vectors table is full and no slots are available for new keys. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | Whether the vectors table is full. ~~bool~~ | + +## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"} + +Get the vectors for the provided keys efficiently as a batch. Required to use +the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for +training. + +| Name | Description | +| ------ | --------------------------------------- | +| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | + +## BaseVectors.to_ops {id="to_ops",tag="method"} + +Dummy method. Implement this to change the embedding matrix to use different +Thinc ops. + +| Name | Description | +| ----- | -------------------------------------------------------- | +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + +## BaseVectors.to_disk {id="to_disk",tag="method"} + +Dummy method to allow serialization. Implement to save vector data with the +pipeline. + +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | + +## BaseVectors.from_disk {id="from_disk",tag="method"} + +Dummy method to allow serialization. Implement to load vector data from a saved +pipeline. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified vectors object. ~~BaseVectors~~ | + +## BaseVectors.to_bytes {id="to_bytes",tag="method"} + +Dummy method to allow serialization. Implement to serialize vector data to a +binary string. + +| Name | Description | +| ----------- | ---------------------------------------------------- | +| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ | + +## BaseVectors.from_bytes {id="from_bytes",tag="method"} + +Dummy method to allow serialization. Implement to load vector data from a binary +string. + +| Name | Description | +| ----------- | ----------------------------------- | +| `data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The vectors object. 
~~BaseVectors~~ | diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index fa4cd0c7ad6..0e92eb12ba4 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`. ## Vectors.is_full {id="is_full",tag="property"} -Whether the vectors table is full and has no slots are available for new keys. -If a table is full, it can be resized using -[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always -full and cannot be resized. +Whether the vectors table is full and no slots are available for new keys. If a +table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize). +In `floret` mode, the table is always full and cannot be resized. > #### Example > @@ -441,7 +440,7 @@ Load state from a binary string. > #### Example > > ```python -> fron spacy.vectors import Vectors +> from spacy.vectors import Vectors > vectors_bytes = vectors.to_bytes() > new_vectors = Vectors(StringStore()) > new_vectors.from_bytes(vectors_bytes) diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx index 5f1e5b817a6..2bd2856b6a3 100644 --- a/website/docs/usage/embeddings-transformers.mdx +++ b/website/docs/usage/embeddings-transformers.mdx @@ -632,6 +632,165 @@ def MyCustomVectors( ) ``` +#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"} + +You can specify a custom registered vectors class under `[nlp.vectors]` in order +to use static vectors in formats other than the ones supported by +[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors) +class to implement your custom vectors. + +As an example, the following `BPEmbVectors` class implements support for +[BPEmb subword embeddings](https://bpemb.h-its.org/): + +```python +# requires: pip install bpemb +import warnings +from pathlib import Path +from typing import Callable, Optional, cast + +from bpemb import BPEmb +from thinc.api import Ops, get_current_ops +from thinc.backends import get_array_ops +from thinc.types import Floats2d + +from spacy.strings import StringStore +from spacy.util import registry +from spacy.vectors import BaseVectors +from spacy.vocab import Vocab + + +class BPEmbVectors(BaseVectors): + def __init__( + self, + *, + strings: Optional[StringStore] = None, + lang: Optional[str] = None, + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, + ): + kwargs = {} + if lang is not None: + kwargs["lang"] = lang + if vs is not None: + kwargs["vs"] = vs + if dim is not None: + kwargs["dim"] = dim + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir + if encode_extra_options is not None: + kwargs["encode_extra_options"] = encode_extra_options + if model_file is not None: + kwargs["model_file"] = model_file + if emb_file is not None: + kwargs["emb_file"] = emb_file + self.bpemb = BPEmb(**kwargs) + self.strings = strings + self.name = repr(self.bpemb) + self.n_keys = -1 + self.mode = "BPEmb" + self.to_ops(get_current_ops()) + + def __contains__(self, key): + return True + + def is_full(self): + return True + + def add(self, key, *, vector=None, row=None): + warnings.warn( + ( + "Skipping BPEmbVectors.add: the bpemb vector table cannot be " + "modified. Vectors are calculated from bytepieces." 
+ ) + ) + return -1 + + def __getitem__(self, key): + return self.get_batch([key])[0] + + def get_batch(self, keys): + keys = [self.strings.as_string(key) for key in keys] + bp_ids = self.bpemb.encode_ids(keys) + ops = get_array_ops(self.bpemb.emb.vectors) + indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32") + lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32") + vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths) + return vecs + + @property + def shape(self): + return self.bpemb.vectors.shape + + def __len__(self): + return self.shape[0] + + @property + def vectors_length(self): + return self.shape[1] + + @property + def size(self): + return self.bpemb.vectors.size + + def to_ops(self, ops: Ops): + self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors) + + +@registry.vectors("BPEmbVectors.v1") +def create_bpemb_vectors( + lang: Optional[str] = "multi", + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, +) -> Callable[[Vocab], BPEmbVectors]: + def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors: + return BPEmbVectors( + strings=vocab.strings, + lang=lang, + vs=vs, + dim=dim, + cache_dir=cache_dir, + encode_extra_options=encode_extra_options, + model_file=model_file, + emb_file=emb_file, + ) + + return bpemb_vectors_factory +``` + + + +Note that the serialization methods are not implemented, so the embeddings are +loaded from your local cache or downloaded by `BPEmb` each time the pipeline is +loaded. + + + +To use this in your pipeline, specify this registered function under +`[nlp.vectors]` in your config: + +```ini +[nlp.vectors] +@vectors = "BPEmbVectors.v1" +lang = "en" +``` + +Or specify it when creating a blank pipeline: + +```python +nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}}) +``` + +Remember to include this code with `--code` when using +[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package). 
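+
+For example, assuming the `BPEmbVectors` code above is saved in a file called
+`bpemb_vectors.py` (the filename and paths below are placeholders), the
+commands might look like this:
+
+```bash
+# Train with the custom vectors registered via --code
+$ python -m spacy train config.cfg --output ./output --code bpemb_vectors.py
+# Package the trained pipeline, bundling the same code file
+$ python -m spacy package ./output/model-best ./packages --code bpemb_vectors.py
+```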
+ ## Pretraining {id="pretraining"} The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 04102095f3a..d2f73d83a66 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -131,6 +131,7 @@ "label": "Other", "items": [ { "text": "Attributes", "url": "/api/attributes" }, + { "text": "BaseVectors", "url": "/api/basevectors" }, { "text": "Corpus", "url": "/api/corpus" }, { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" }, { "text": "KnowledgeBase", "url": "/api/kb" }, From 07374430964848148bc018ff2f36f3dc3cf3b315 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 2 Aug 2023 08:15:12 +0200 Subject: [PATCH 09/39] feat: add example stubs (3) (#12801) * feat: add example stubs * fix: add required annotations * fix: mypy issues * fix: use Py36-compatible Portocol * Minor reformatting * adding further type specifications and removing internal methods * black formatting * widen type to iterable * add private methods that are being used by the built-in convertors * revert changes to corpus.py * fixes * fixes * fix typing of PlainTextCorpus --------- Co-authored-by: Basile Dura Co-authored-by: Adriane Boyd --- spacy/tokens/doc.pyi | 8 ++++- spacy/training/corpus.py | 2 +- spacy/training/example.pyi | 66 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 spacy/training/example.pyi diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 00c7a9d07f3..55222f8aa87 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -8,6 +8,7 @@ from typing import ( List, Optional, Protocol, + Sequence, Tuple, Union, overload, @@ -134,7 +135,12 @@ class Doc: def text(self) -> str: ... @property def text_with_ws(self) -> str: ... - ents: Tuple[Span] + # Ideally the getter would output Tuple[Span] + # see https://github.com/python/mypy/issues/3004 + @property + def ents(self) -> Sequence[Span]: ... + @ents.setter + def ents(self, value: Sequence[Span]) -> None: ... def set_ents( self, entities: List[Span], diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 6037c15e33d..5cc2733a540 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -63,7 +63,7 @@ def create_plain_text_reader( path: Optional[Path], min_length: int = 0, max_length: int = 0, -) -> Callable[["Language"], Iterable[Doc]]: +) -> Callable[["Language"], Iterable[Example]]: """Iterate Example objects from a file or directory of plain text UTF-8 files with one line per doc. diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi new file mode 100644 index 00000000000..06639d70c06 --- /dev/null +++ b/spacy/training/example.pyi @@ -0,0 +1,66 @@ +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple + +from ..tokens import Doc, Span +from ..vocab import Vocab +from .alignment import Alignment + +def annotations_to_doc( + vocab: Vocab, + tok_annot: Dict[str, Any], + doc_annot: Dict[str, Any], +) -> Doc: ... +def validate_examples( + examples: Iterable[Example], + method: str, +) -> None: ... +def validate_get_examples( + get_examples: Callable[[], Iterable[Example]], + method: str, +): ... + +class Example: + x: Doc + y: Doc + + def __init__( + self, + predicted: Doc, + reference: Doc, + *, + alignment: Optional[Alignment] = None, + ): ... + def __len__(self) -> int: ... + @property + def predicted(self) -> Doc: ... + @predicted.setter + def predicted(self, doc: Doc) -> None: ... 
+ @property + def reference(self) -> Doc: ... + @reference.setter + def reference(self, doc: Doc) -> None: ... + def copy(self) -> Example: ... + @classmethod + def from_dict(cls, predicted: Doc, example_dict: Dict[str, Any]) -> Example: ... + @property + def alignment(self) -> Alignment: ... + def get_aligned(self, field: str, as_string=False): ... + def get_aligned_parse(self, projectivize=True): ... + def get_aligned_sent_starts(self): ... + def get_aligned_spans_x2y( + self, x_spans: Iterable[Span], allow_overlap=False + ) -> List[Span]: ... + def get_aligned_spans_y2x( + self, y_spans: Iterable[Span], allow_overlap=False + ) -> List[Span]: ... + def get_aligned_ents_and_ner(self) -> Tuple[List[Span], List[str]]: ... + def get_aligned_ner(self) -> List[str]: ... + def get_matching_ents(self, check_label: bool = True) -> List[Span]: ... + def to_dict(self) -> Dict[str, Any]: ... + def split_sents(self) -> List[Example]: ... + @property + def text(self) -> str: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +def _parse_example_dict_data(example_dict): ... +def _fix_legacy_dict_data(example_dict): ... From e5773e0c6940070f192eb9663a9fda2a2c989dae Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Aug 2023 09:35:16 +0200 Subject: [PATCH 10/39] Extend to spacy-transformers v1.3.x (#12877) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 116e40f2cc9..d19e5bc018b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -81,7 +81,7 @@ console_scripts = lookups = spacy_lookups_data>=1.0.3,<1.1.0 transformers = - spacy_transformers>=1.1.2,<1.3.0 + spacy_transformers>=1.1.2,<1.4.0 cuda = cupy>=5.0.0b4,<13.0.0 cuda80 = From 060241a8d571023937fd8ca701479909a90782da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 10 Aug 2023 11:42:09 +0200 Subject: [PATCH 11/39] Revert "Extend to spacy-transformers v1.3.x (#12877)" This reverts commit e5773e0c6940070f192eb9663a9fda2a2c989dae. 
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d19e5bc018b..116e40f2cc9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -81,7 +81,7 @@ console_scripts = lookups = spacy_lookups_data>=1.0.3,<1.1.0 transformers = - spacy_transformers>=1.1.2,<1.4.0 + spacy_transformers>=1.1.2,<1.3.0 cuda = cupy>=5.0.0b4,<13.0.0 cuda80 = From 9622c11529a5b8b25617fb72584997ee94d906ff Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Aug 2023 10:59:51 +0200 Subject: [PATCH 12/39] Extend to weasel v0.2 (#12902) --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9e787a22312..89fc248fcb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 -weasel>=0.1.0,<0.2.0 +weasel>=0.1.0,<0.3.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/setup.cfg b/setup.cfg index 116e40f2cc9..078b7d4bd08 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.1.0,<0.2.0 + weasel>=0.1.0,<0.3.0 # Third-party dependencies typer>=0.3.0,<0.10.0 pathy>=0.10.0 From 6a4aa43164229262f6770272deedebe9ffc45329 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Aug 2023 13:05:46 +0200 Subject: [PATCH 13/39] Extend to thinc v8.2 (#12897) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c611c6c1c93..336c0793caa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.8,<8.2.0", + "thinc>=8.1.8,<8.3.0", "numpy>=1.15.0; python_version < '3.9'", "numpy>=1.25.0; python_version >= '3.9'", ] diff --git a/requirements.txt b/requirements.txt index 89fc248fcb7..237c790b4aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.8,<8.2.0 +thinc>=8.1.8,<8.3.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 diff --git a/setup.cfg b/setup.cfg index 078b7d4bd08..b01298a72dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.8,<8.2.0 + thinc>=8.1.8,<8.3.0 install_requires = # Our libraries spacy-legacy>=3.0.11,<3.1.0 @@ -52,7 +52,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.8,<8.2.0 + thinc>=8.1.8,<8.3.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 From 198488ee86735f0f37310913e8dbe69d01371241 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Aug 2023 17:36:53 +0200 Subject: [PATCH 14/39] Extend to weasel v0.3 (#12908) * Extend to weasel v0.3 * Clean up unused imports in test_cli --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/tests/test_cli.py | 8 ++------ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 237c790b4aa..b6cc542a567 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 -weasel>=0.1.0,<0.3.0 
+weasel>=0.1.0,<0.4.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/setup.cfg b/setup.cfg index b01298a72dd..9a5388c8035 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.1.0,<0.3.0 + weasel>=0.1.0,<0.4.0 # Third-party dependencies typer>=0.3.0,<0.10.0 pathy>=0.10.0 diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 9b4f6851e47..c107992ed57 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,18 +1,14 @@ import math import os -import time from collections import Counter from pathlib import Path from typing import Any, Dict, List, Tuple -import numpy import pytest import srsly from click import NoSuchOption from packaging.specifiers import SpecifierSet -from thinc.api import Config, ConfigValidationError -from weasel.cli.remote_storage import RemoteStorage -from weasel.cli.run import _check_requirements +from thinc.api import Config import spacy from spacy import about @@ -39,7 +35,7 @@ from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language -from spacy.schemas import RecommendationSchema, validate +from spacy.schemas import RecommendationSchema from spacy.tokens import Doc, DocBin from spacy.tokens.span import Span from spacy.training import Example, docs_to_json, offsets_to_biluo_tags From 869cc4ab0b44da9455e772f40d59244fa9c6eb28 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Aug 2023 09:03:35 +0200 Subject: [PATCH 15/39] warn when an unsupported/unknown key is given to the dependency matcher (#12928) --- spacy/errors.py | 1 + spacy/matcher/dependencymatcher.pyx | 8 ++++++++ spacy/tests/matcher/test_dependency_matcher.py | 5 +++++ 3 files changed, 14 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 14ec669a308..dac07f80409 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -219,6 +219,7 @@ class Warnings(metaclass=ErrorsWithCodes): W125 = ("The StaticVectors key_attr is no longer used. 
To set a custom " "key attribute for vectors, configure it through Vectors(attr=) or " "'spacy init vectors --attr'") + W126 = ("These keys are unsupported: {unsupported}") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 348e000ffb2..1f66d99b222 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -129,6 +129,7 @@ cdef class DependencyMatcher: else: required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"} relation_keys = set(relation.keys()) + # Identify required keys that have not been specified missing = required_keys - relation_keys if missing: missing_txt = ", ".join(list(missing)) @@ -136,6 +137,13 @@ cdef class DependencyMatcher: required=required_keys, missing=missing_txt )) + # Identify additional, unsupported keys + unsupported = relation_keys - required_keys + if unsupported: + unsupported_txt = ", ".join(list(unsupported)) + warnings.warn(Warnings.W126.format( + unsupported=unsupported_txt + )) if ( relation["RIGHT_ID"] in visited_nodes or relation["LEFT_ID"] not in visited_nodes diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 44b3bb26b0e..be33f90cf44 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -216,6 +216,11 @@ def test_dependency_matcher_pattern_validation(en_vocab): pattern2 = copy.deepcopy(pattern) pattern2[1]["RIGHT_ID"] = "fox" matcher.add("FOUNDED", [pattern2]) + # invalid key + with pytest.warns(UserWarning): + pattern2 = copy.deepcopy(pattern) + pattern2[1]["FOO"] = "BAR" + matcher.add("FOUNDED", [pattern2]) def test_dependency_matcher_callback(en_vocab, doc): From c2303858e617f28f981db599afd3b05c8c824321 Mon Sep 17 00:00:00 2001 From: Vinit Ravishankar Date: Tue, 29 Aug 2023 17:52:16 +0200 Subject: [PATCH 16/39] Documentation for spacy-curated-transformers (#12677) * initial * initial documentation run * fix typo * Remove mentions of Torchscript and quantization Both are disabled in the initial release of `spacy-curated-transformers`. 
* Fix `piece_encoder` entries * Remove `spacy-transformers`-specific warning * Fix duplicate entries in tables * Doc fixes Co-authored-by: Sofie Van Landeghem * Remove type aliases * Fix copy-paste typo * Change `debug pieces` version tag to `3.7` * Set curated transformers API version to `3.7` * Fix transformer listener naming * Add docs for `init fill-config-transformer` * Update CLI command invocation syntax * Update intro section of the pipeline component docs * Fix source URL * Add a note to the architectures section about the `init fill-config-transformer` CLI command * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem * Update CLI command name, args * Remove hyphen from the `curated-transformers.mdx` filename * Fix links * Remove placeholder text * Add text to the model/tokenizer loader sections * Fill in the `DocTransformerOutput` section * Formatting fixes * Add curated transformer page to API docs sidebar * More formatting fixes * Remove TODO comment * Remove outdated info about default config * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem * Add link to HF model hub * `prettier` --------- Co-authored-by: Madeesh Kannan Co-authored-by: Sofie Van Landeghem --- website/docs/api/architectures.mdx | 280 ++++++++++++ website/docs/api/cli.mdx | 73 ++- website/docs/api/curatedtransformer.mdx | 572 ++++++++++++++++++++++++ website/meta/sidebars.json | 1 + 4 files changed, 919 insertions(+), 7 deletions(-) create mode 100644 website/docs/api/curatedtransformer.mdx diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index bab24f13b52..2853d25128f 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -481,6 +481,286 @@ The other arguments are shared between all versions. +## Curated Transformer architectures {id="curated-trf",source="https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/models/architectures.py"} + +The following architectures are provided by the package +[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers). +See the [usage documentation](/usage/embeddings-transformers#transformers) for +how to integrate the architectures into your training config. + +When loading the model +[from the Hugging Face Hub](/api/curatedtransformer#hf_trfencoder_loader), the +model config's parameters must be same as the hyperparameters used by the +pre-trained model. The +[`init fill-curated-transformer`](/api/cli#init-fill-curated-transformer) CLI +command can be used to automatically fill in these values. + +### spacy-curated-transformers.AlbertTransformer.v1 + +Construct an ALBERT transformer model. + +| Name | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------- | +| `vocab_size` | Vocabulary size. ~~int~~ | +| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ | +| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ | +| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ | +| `embedding_width` | Width of the embedding representations. ~~int~~ | +| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ | +| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ | +| `hidden_width` | Width of the final representations. 
~~int~~ | +| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ | +| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ | +| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ | +| `model_max_length` | Maximum length of model inputs. ~~int~~ | +| `num_attention_heads` | Number of self-attention heads. ~~int~~ | +| `num_hidden_groups` | Number of layer groups whose constituents share parameters. ~~int~~ | +| `num_hidden_layers` | Number of hidden layers. ~~int~~ | +| `padding_idx` | Index of the padding meta-token. ~~int~~ | +| `type_vocab_size` | Type vocabulary size. ~~int~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model~~ | + +### spacy-curated-transformers.BertTransformer.v1 + +Construct a BERT transformer model. + +| Name | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------- | +| `vocab_size` | Vocabulary size. ~~int~~ | +| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ | +| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ | +| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ | +| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ | +| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ | +| `hidden_width` | Width of the final representations. ~~int~~ | +| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ | +| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ | +| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ | +| `model_max_length` | Maximum length of model inputs. ~~int~~ | +| `num_attention_heads` | Number of self-attention heads. ~~int~~ | +| `num_hidden_layers` | Number of hidden layers. ~~int~~ | +| `padding_idx` | Index of the padding meta-token. ~~int~~ | +| `type_vocab_size` | Type vocabulary size. ~~int~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model~~ | + +### spacy-curated-transformers.CamembertTransformer.v1 + +Construct a CamemBERT transformer model. + +| Name | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------- | +| `vocab_size` | Vocabulary size. ~~int~~ | +| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ | +| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ | +| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ | +| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ | +| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ | +| `hidden_width` | Width of the final representations. ~~int~~ | +| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ | +| `layer_norm_eps` | Epsilon for layer normalization. 
~~float~~ | +| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ | +| `model_max_length` | Maximum length of model inputs. ~~int~~ | +| `num_attention_heads` | Number of self-attention heads. ~~int~~ | +| `num_hidden_layers` | Number of hidden layers. ~~int~~ | +| `padding_idx` | Index of the padding meta-token. ~~int~~ | +| `type_vocab_size` | Type vocabulary size. ~~int~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model~~ | + +### spacy-curated-transformers.RobertaTransformer.v1 + +Construct a RoBERTa transformer model. + +| Name | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------- | +| `vocab_size` | Vocabulary size. ~~int~~ | +| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ | +| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ | +| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ | +| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ | +| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ | +| `hidden_width` | Width of the final representations. ~~int~~ | +| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ | +| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ | +| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ | +| `model_max_length` | Maximum length of model inputs. ~~int~~ | +| `num_attention_heads` | Number of self-attention heads. ~~int~~ | +| `num_hidden_layers` | Number of hidden layers. ~~int~~ | +| `padding_idx` | Index of the padding meta-token. ~~int~~ | +| `type_vocab_size` | Type vocabulary size. ~~int~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model~~ | + +### spacy-curated-transformers.XlmrTransformer.v1 + +Construct a XLM-RoBERTa transformer model. + +| Name | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------- | +| `vocab_size` | Vocabulary size. ~~int~~ | +| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ | +| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ | +| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ | +| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ | +| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ | +| `hidden_width` | Width of the final representations. ~~int~~ | +| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ | +| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ | +| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ | +| `model_max_length` | Maximum length of model inputs. ~~int~~ | +| `num_attention_heads` | Number of self-attention heads. ~~int~~ | +| `num_hidden_layers` | Number of hidden layers. ~~int~~ | +| `padding_idx` | Index of the padding meta-token. 
~~int~~ | +| `type_vocab_size` | Type vocabulary size. ~~int~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model~~ | + +### spacy-curated-transformers.ScalarWeight.v1 + +Construct a model that accepts a list of transformer layer outputs and returns a +weighted representation of the same. + +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------- | +| `num_layers` | Number of transformer hidden layers. ~~int~~ | +| `dropout_prob` | Dropout probability. ~~float~~ | +| `mixed_precision` | Use mixed-precision training. ~~bool~~ | +| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ | +| **CREATES** | The model using the architecture ~~Model[ScalarWeightInT, ScalarWeightOutT]~~ | + +### spacy-curated-transformers.TransformerLayersListener.v1 + +Construct a listener layer that communicates with one or more upstream +Transformer components. This layer extracts the output of the last transformer +layer and performs pooling over the individual pieces of each `Doc` token, +returning their corresponding representations. The upstream name should either +be the wildcard string '\*', or the name of the Transformer component. + +In almost all cases, the wildcard string will suffice as there'll only be one +upstream Transformer component. But in certain situations, e.g: you have +disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline +but a downstream task requires its own token representations, you could end up +with more than one Transformer component in the pipeline. + +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `layers` | The number of layers produced by the upstream transformer component, excluding the embedding layer. ~~int~~ | +| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ | +| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ | +| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ | +| `grad_factor` | Factor to multiply gradients with. ~~float~~ | +| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ | + +### spacy-curated-transformers.LastTransformerLayerListener.v1 + +Construct a listener layer that communicates with one or more upstream +Transformer components. This layer extracts the output of the last transformer +layer and performs pooling over the individual pieces of each Doc token, +returning their corresponding representations. The upstream name should either +be the wildcard string '\*', or the name of the Transformer component. + +In almost all cases, the wildcard string will suffice as there'll only be one +upstream Transformer component. But in certain situations, e.g: you have +disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline +but a downstream task requires its own token representations, you could end up +with more than one Transformer component in the pipeline. 
+ +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ | +| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ | +| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ | +| `grad_factor` | Factor to multiply gradients with. ~~float~~ | +| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ | + +### spacy-curated-transformers.ScalarWeightingListener.v1 + +Construct a listener layer that communicates with one or more upstream +Transformer components. This layer calculates a weighted representation of all +transformer layer outputs and performs pooling over the individual pieces of +each Doc token, returning their corresponding representations. + +Requires its upstream Transformer components to return all layer outputs from +their models. The upstream name should either be the wildcard string '\*', or +the name of the Transformer component. + +In almost all cases, the wildcard string will suffice as there'll only be one +upstream Transformer component. But in certain situations, e.g: you have +disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline +but a downstream task requires its own token representations, you could end up +with more than one Transformer component in the pipeline. + +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ | +| `weighting` | Model that is used to perform the weighting of the different layer outputs. ~~Model~~ | +| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ | +| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ | +| `grad_factor` | Factor to multiply gradients with. ~~float~~ | +| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ | + +### spacy-curated-transformers.BertWordpieceEncoder.v1 + +Construct a WordPiece piece encoder model that accepts a list of token sequences +or documents and returns a corresponding list of piece identifiers. This encoder +also splits each token on punctuation characters, as expected by most BERT +models. + +This model must be separately initialized using an appropriate loader. + +### spacy-curated-transformers.ByteBpeEncoder.v1 + +Construct a Byte-BPE piece encoder model that accepts a list of token sequences +or documents and returns a corresponding list of piece identifiers. + +This model must be separately initialized using an appropriate loader. + +### spacy-curated-transformers.CamembertSentencepieceEncoder.v1 + +Construct a SentencePiece piece encoder model that accepts a list of token +sequences or documents and returns a corresponding list of piece identifiers +with CamemBERT post-processing applied. + +This model must be separately initialized using an appropriate loader. 
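+
+As a rough sketch of how these piece encoders are wired up, the encoder is
+nested under the transformer model's `piece_encoder` key in the training
+config. The example below assumes a pipe named `transformer` that pairs the
+CamemBERT piece encoder above with the corresponding `CamembertTransformer`
+architecture; the remaining hyperparameters are omitted here and can be filled
+in with [`init fill-curated-transformer`](/api/cli#init-fill-curated-transformer):
+
+```ini
+[components.transformer]
+factory = "curated_transformer"
+
+[components.transformer.model]
+@architectures = "spacy-curated-transformers.CamembertTransformer.v1"
+
+[components.transformer.model.piece_encoder]
+@architectures = "spacy-curated-transformers.CamembertSentencepieceEncoder.v1"
+```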
+
+### spacy-curated-transformers.CharEncoder.v1
+
+Construct a character piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.SentencepieceEncoder.v1
+
+Construct a SentencePiece piece encoder model that accepts a list of token
+sequences or documents and returns a corresponding list of piece identifiers.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.WordpieceEncoder.v1
+
+Construct a WordPiece piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers. This encoder
+also splits each token on punctuation characters, as expected by most BERT
+models.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.XlmrSentencepieceEncoder.v1
+
+Construct a SentencePiece piece encoder model that accepts a list of token
+sequences or documents and returns a corresponding list of piece identifiers
+with XLM-RoBERTa post-processing applied.
+
+This model must be separately initialized using an appropriate loader.
+
 ## Pretraining architectures {id="pretrain",source="spacy/ml/models/multi_task.py"}
 
 The spacy `pretrain` command lets you initialize a `Tok2Vec` layer in your
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 6a87f78b818..f71b7a75a9b 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -185,6 +185,29 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES**    | Complete and auto-filled config file for training.         |
 
+### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}
+
+Auto-fill the Hugging Face model hyperparameters and loader parameters of a
+[Curated Transformer](/api/curatedtransformer) pipeline component in a
+[.cfg file](/usage/training#config). The name and revision of the
+[Hugging Face model](https://huggingface.co/models) can either be passed as
+command-line arguments or read from the
+`initialize.components.transformer.encoder_loader` config section.
+
+```bash
+$ python -m spacy init fill-curated-transformer [base_path] [output_file] [--model-name] [--model-revision] [--pipe-name] [--code]
+```
+
+| Name                     | Description                                                                                                                            |
+| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_path`              | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~               |
+| `output_file`            | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~      |
+| `--model-name`, `-m`     | Name of the Hugging Face model. Defaults to the model name from the encoder loader config. ~~Optional[str] (option)~~                   |
+| `--model-revision`, `-r` | Revision of the Hugging Face model. Defaults to `main`. ~~Optional[str] (option)~~                                                      |
+| `--pipe-name`, `-n`      | Name of the Curated Transformer pipe whose config is to be filled. Defaults to the first transformer pipe. ~~Optional[str] (option)~~   |
+| `--code`, `-c`           | Path to Python file with additional code to be imported. 
Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| **CREATES** | Complete and auto-filled config file for training. | + ### init vectors {id="init-vectors",version="3",tag="command"} Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use @@ -1019,6 +1042,42 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **PRINTS** | Debugging information. | +### debug pieces {id="debug-pieces",version="3.7",tag="command"} + +Analyze word- or sentencepiece stats. + +```bash +$ python -m spacy debug pieces [config_path] [--code] [--name] [overrides] +``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to config file. ~~Union[Path, str] (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--name`, `-n` | Name of the Curated Transformer pipe whose config is to be filled. Defaults to the first transformer pipe. ~~Optional[str] (option)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **PRINTS** | Debugging information. | + + + +```bash +$ python -m spacy debug pieces ./config.cfg +``` + +``` +========================= Training corpus statistics ========================= +Median token length: 1.0 +Mean token length: 1.54 +Token length range: [1, 13] + +======================= Development corpus statistics ======================= +Median token length: 1.0 +Mean token length: 1.44 +Token length range: [1, 8] +``` + + + ## train {id="train",tag="command"} Train a pipeline. Expects data in spaCy's @@ -1651,10 +1710,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose] > $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl > ``` -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | -| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | -| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | -| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | -| **UPLOADS** | The pipeline to the hub. | +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------- | +| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). 
~~Path(positional)~~ |
+| `--org`, `-o`     | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~                             |
+| `--msg`, `-m`     | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~                            |
+| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~                          |
+| **UPLOADS**       | The pipeline to the hub.                                                                                              |
diff --git a/website/docs/api/curatedtransformer.mdx b/website/docs/api/curatedtransformer.mdx
new file mode 100644
index 00000000000..5fdbd86cbea
--- /dev/null
+++ b/website/docs/api/curatedtransformer.mdx
@@ -0,0 +1,572 @@
+---
+title: CuratedTransformer
+teaser:
+  Pipeline component for multi-task learning with Curated Transformer models
+tag: class
+source: github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py
+version: 3.7
+api_base_class: /api/pipe
+api_string_name: curated_transformer
+---
+
+
+
+This component is available via the extension package
+[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers).
+It exposes the component via entry points, so if you have the package installed,
+using `factory = "curated_transformer"` in your
+[training config](/usage/training#config) will work out-of-the-box.
+
+
+
+This pipeline component lets you use a curated set of transformer models in your
+pipeline. spaCy Curated Transformers currently supports the following model
+types:
+
+- ALBERT
+- BERT
+- CamemBERT
+- RoBERTa
+- XLM-RoBERTa
+
+If you want to use another type of model, use
+[spacy-transformers](/api/spacy-transformers), which allows you to use all
+Hugging Face transformer models with spaCy.
+
+You will usually connect downstream components to a shared Curated Transformer
+pipe using one of the Curated Transformer listener layers. This works similarly
+to spaCy's [Tok2Vec](/api/tok2vec), and the
+[Tok2VecListener](/api/architectures/#Tok2VecListener) sublayer. The component
+assigns the output of the transformer to the `Doc`'s extension attributes. To
+access the values, you can use the custom
+[`Doc._.trf_data`](#assigned-attributes) attribute.
+
+For more details, see the [usage documentation](/usage/embeddings-transformers).
+
+## Assigned Attributes {id="assigned-attributes"}
+
+The component sets the following
+[custom extension attribute](/usage/processing-pipeline#custom-components-attributes):
+
+| Location         | Value                                                                        |
+| ---------------- | ---------------------------------------------------------------------------- |
+| `Doc._.trf_data` | Curated Transformer outputs for the `Doc` object. ~~DocTransformerOutput~~   |
+
+## Config and Implementation {id="config"}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures#curated-trf) documentation for details
+on the curated transformer architectures and their arguments and
+hyperparameters. 
+ +> #### Example +> +> ```python +> from spacy_curated_transformers.pipeline.transformer import DEFAULT_CONFIG +> +> nlp.add_pipe("curated_transformer", config=DEFAULT_CONFIG) +> ``` + +| Setting | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [`XlmrTransformer`](/api/architectures#curated-trf). ~~Model~~ | +| `frozen` | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~ | +| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ | + +```python +https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py +``` + +## CuratedTransformer.\_\_init\_\_ {id="init",tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> trf = nlp.add_pipe("curated_transformer") +> +> # Construction via add_pipe with custom config +> config = { +> "model": { +> "@architectures": "spacy-curated-transformers.XlmrTransformer.v1", +> "vocab_size": 250002, +> "num_hidden_layers": 12, +> "hidden_width": 768, +> "piece_encoder": { +> "@architectures": "spacy-curated-transformers.XlmrSentencepieceEncoder.v1" +> } +> } +> } +> trf = nlp.add_pipe("curated_transformer", config=config) +> +> # Construction from class +> from spacy_curated_transformers import CuratedTransformer +> trf = CuratedTransformer(nlp.vocab, model) +> ``` + +Construct a `CuratedTransformer` component. One or more subsequent spaCy +components can use the transformer outputs as features in its model, with +gradients backpropagated to the single shared weights. The activations from the +transformer are saved in the [`Doc._.trf_data`](#assigned-attributes) extension +attribute. You can also provide a callback to set additional annotations. In +your application, you would normally use a shortcut for this and instantiate the +component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | One of the supported pre-trained transformer models. ~~Model~~ | +| _keyword-only_ | | +| `name` | The component instance name. ~~str~~ | +| `frozen` | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~ | +| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ | + +## CuratedTransformer.\_\_call\_\_ {id="call",tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. 
+This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/curatedtransformer#call) and +[`pipe`](/api/curatedtransformer#pipe) delegate to the +[`predict`](/api/curatedtransformer#predict) and +[`set_annotations`](/api/curatedtransformer#set_annotations) methods. + +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> trf = nlp.add_pipe("curated_transformer") +> # This usually happens under the hood +> processed = trf(doc) +> ``` + +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | + +## CuratedTransformer.pipe {id="pipe",tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/curatedtransformer#call) +and [`pipe`](/api/curatedtransformer#pipe) delegate to the +[`predict`](/api/curatedtransformer#predict) and +[`set_annotations`](/api/curatedtransformer#set_annotations) methods. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> for doc in trf.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | + +## CuratedTransformer.initialize {id="initialize",tag="method"} + +Initialize the component for training and return an +[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a +function that returns an iterable of [`Example`](/api/example) objects. **At +least one example should be supplied.** The data examples are used to +**initialize the model** of the component and can either be the full training +data or a representative sample. Initialization includes validating the network, +[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> trf.initialize(lambda: examples, nlp=nlp) +> ``` + +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `encoder_loader` | Initialization callback for the transformer model. ~~Optional[Callable]~~ | +| `piece_loader` | Initialization callback for the input piece encoder. ~~Optional[Callable]~~ | + +## CuratedTransformer.predict {id="predict",tag="method"} + +Apply the component's model to a batch of [`Doc`](/api/doc) objects without +modifying them. 
+ +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> scores = trf.predict([doc1, doc2]) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | + +## CuratedTransformer.set_annotations {id="set_annotations",tag="method"} + +Assign the extracted features to the `Doc` objects. By default, the +[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object is +written to the [`Doc._.trf_data`](#assigned-attributes) attribute. Your +`set_extra_annotations` callback is then called, if provided. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> scores = trf.predict(docs) +> trf.set_annotations(docs, scores) +> ``` + +| Name | Description | +| -------- | ------------------------------------------------------------ | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `CuratedTransformer.predict`. | + +## CuratedTransformer.update {id="update",tag="method"} + +Prepare for an update to the transformer. + +Like the [`Tok2Vec`](api/tok2vec) component, the `CuratedTransformer` component +is unusual in that it does not receive "gold standard" annotations to calculate +a weight update. The optimal output of the transformer data is unknown; it's a +hidden layer inside the network that is updated by backpropagating from output +layers. + +The `CuratedTransformer` component therefore does not perform a weight update +during its own `update` method. Instead, it runs its transformer model and +communicates the output and the backpropagation callback to any downstream +components that have been connected to it via the transformer listener sublayer. +If there are multiple listeners, the last layer will actually backprop to the +transformer and call the optimizer, while the others simply increment the +gradients. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> optimizer = nlp.initialize() +> losses = trf.update(examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + +## CuratedTransformer.create_optimizer {id="create_optimizer",tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> optimizer = trf.create_optimizer() +> ``` + +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | + +## CuratedTransformer.use_params {id="use_params",tag="method, contextmanager"} + +Modify the pipe's model to use the given parameter values. 
At the end of the +context, the original parameters are restored. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> with trf.use_params(optimizer.averages): +> trf.to_disk("/best_model") +> ``` + +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## CuratedTransformer.to_disk {id="to_disk",tag="method"} + +Serialize the pipe to disk. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> trf.to_disk("/path/to/transformer") +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | + +## CuratedTransformer.from_disk {id="from_disk",tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> trf.from_disk("/path/to/transformer") +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `CuratedTransformer` object. ~~CuratedTransformer~~ | + +## CuratedTransformer.to_bytes {id="to_bytes",tag="method"} + +> #### Example +> +> ```python +> trf = nlp.add_pipe("curated_transformer") +> trf_bytes = trf.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `CuratedTransformer` object. ~~bytes~~ | + +## CuratedTransformer.from_bytes {id="from_bytes",tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> trf_bytes = trf.to_bytes() +> trf = nlp.add_pipe("curated_transformer") +> trf.from_bytes(trf_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `CuratedTransformer` object. ~~CuratedTransformer~~ | + +## Serialization Fields {id="serialization-fields"} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. 
+
+> #### Example
+>
+> ```python
+> data = trf.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
+
+## DocTransformerOutput {id="doctransformeroutput",tag="dataclass"}
+
+Curated Transformer outputs for one `Doc` object. Stores the dense
+representations generated by the transformer for each piece identifier. Piece
+identifiers are grouped by token. Instances of this class are typically assigned
+to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
+attribute.
+
+| Name | Description |
+| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `all_outputs` | List of `Ragged` tensors that correspond to the outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
+| `last_layer_only` | Whether only the last transformer layer's outputs are preserved. ~~bool~~ |
+
+### DocTransformerOutput.embedding_layer {id="doctransformeroutput-embeddinglayer",tag="property"}
+
+Return the output of the transformer's embedding layer or `None` if
+`last_layer_only` is `True`.
+
+| Name | Description |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Embedding layer output. ~~Optional[Ragged]~~ |
+
+### DocTransformerOutput.last_hidden_layer_state {id="doctransformeroutput-lasthiddenlayerstate",tag="property"}
+
+Return the output of the transformer's last hidden layer.
+
+| Name | Description |
+| ----------- | ------------------------------------ |
+| **RETURNS** | Last hidden layer output. ~~Ragged~~ |
+
+### DocTransformerOutput.all_hidden_layer_states {id="doctransformeroutput-allhiddenlayerstates",tag="property"}
+
+Return the outputs of all transformer layers (excluding the embedding layer).
+
+| Name | Description |
+| ----------- | -------------------------------------- |
+| **RETURNS** | Hidden layer outputs. ~~List[Ragged]~~ |
+
+### DocTransformerOutput.num_outputs {id="doctransformeroutput-numoutputs",tag="property"}
+
+Return the number of layer outputs stored in the `DocTransformerOutput` instance
+(including the embedding layer).
+
+| Name | Description |
+| ----------- | -------------------------- |
+| **RETURNS** | Number of outputs. ~~int~~ |
+
+## Span Getters {id="span_getters",source="github.com/explosion/spacy-transformers/blob/master/spacy_curated_transformers/span_getters.py"}
+
+Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
+return a list of [`Span`](/api/span) objects for each doc to be processed by
+the transformer. This is used to manage long documents by cutting them into
+smaller sequences before running the transformer. The spans are allowed to
+overlap, and you can also omit sections of the `Doc` if they are not relevant.
+Span getters can be referenced in the
+`[components.transformer.model.with_spans]` block of the config to customize the
+sequences processed by the transformer.
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------- |
+| `docs` | A batch of `Doc` objects.
~~Iterable[Doc]~~ |
+| **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ |
+
+### WithStridedSpans.v1 {id="strided_spans",tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [transformer.model.with_spans]
+> @architectures = "spacy-curated-transformers.WithStridedSpans.v1"
+> stride = 96
+> window = 128
+> ```
+
+Create a span getter for strided spans. If you set the `window` and `stride` to
+the same value, the spans will cover each token once. Setting `stride` lower
+than `window` will allow for an overlap, so that some tokens are counted twice.
+This can be desirable, because it allows all tokens to have both a left and
+right context.
+
+| Name | Description |
+| -------- | ------------------------ |
+| `window` | The window size. ~~int~~ |
+| `stride` | The stride size. ~~int~~ |
+
+## Model Loaders
+
+[Curated Transformer models](/api/architectures#curated-trf) are constructed
+with default hyperparameters and randomized weights when the pipeline is
+created. To load the weights of an existing pre-trained model into the pipeline,
+one of the following loader callbacks can be used. The pre-trained model must
+have the same hyperparameters as the model used by the pipeline.
+
+### HFTransformerEncoderLoader.v1 {id="hf_trfencoder_loader",tag="registered_function"}
+
+Construct a callback that initializes a supported transformer model with weights
+from a corresponding HuggingFace model.
+
+| Name | Description |
+| ---------- | ------------------------------------------ |
+| `name` | Name of the HuggingFace model. ~~str~~ |
+| `revision` | Name of the model revision/branch. ~~str~~ |
+
+### PyTorchCheckpointLoader.v1 {id="pytorch_checkpoint_loader",tag="registered_function"}
+
+Construct a callback that initializes a supported transformer model with weights
+from a PyTorch checkpoint.
+
+| Name | Description |
+| ------ | ---------------------------------------- |
+| `path` | Path to the PyTorch checkpoint. ~~Path~~ |
+
+## Tokenizer Loaders
+
+[Curated Transformer models](/api/architectures#curated-trf) must be paired with
+a matching tokenizer (piece encoder) model in a spaCy pipeline. As with the
+transformer models, tokenizers are constructed with an empty vocabulary during
+pipeline creation; they need to be initialized with an appropriate loader
+before use in training/inference.
+
+### ByteBPELoader.v1 {id="bytebpe_loader",tag="registered_function"}
+
+Construct a callback that initializes a Byte-BPE piece encoder model.
+
+| Name | Description |
+| ------------- | ------------------------------------- |
+| `vocab_path` | Path to the vocabulary file. ~~Path~~ |
+| `merges_path` | Path to the merges file. ~~Path~~ |
+
+### CharEncoderLoader.v1 {id="charencoder_loader",tag="registered_function"}
+
+Construct a callback that initializes a character piece encoder model.
+
+| Name | Description |
+| ----------- | --------------------------------------------------------------------------- |
+| `path` | Path to the serialized character model. ~~Path~~ |
+| `bos_piece` | Piece used as a beginning-of-sentence token. Defaults to `"[BOS]"`. ~~str~~ |
+| `eos_piece` | Piece used as an end-of-sentence token. Defaults to `"[EOS]"`. ~~str~~ |
+| `unk_piece` | Piece used as a stand-in for unknown tokens. Defaults to `"[UNK]"`. ~~str~~ |
+| `normalize` | Unicode normalization form to use. Defaults to `"NFKC"`.
~~str~~ | + +### HFPieceEncoderLoader.v1 {id="hf_pieceencoder_loader",tag="registered_function"} + +Construct a callback that initializes a HuggingFace piece encoder model. Used in +conjunction with the HuggingFace model loader. + +| Name | Description | +| ---------- | ------------------------------------------ | +| `name` | Name of the HuggingFace model. ~~str~~ | +| `revision` | Name of the model revision/branch. ~~str~~ | + +### SentencepieceLoader.v1 {id="sentencepiece_loader",tag="registered_function"} + +Construct a callback that initializes a SentencePiece piece encoder model. + +| Name | Description | +| ------ | ---------------------------------------------------- | +| `path` | Path to the serialized SentencePiece model. ~~Path~~ | + +### WordpieceLoader.v1 {id="wordpiece_loader",tag="registered_function"} + +Construct a callback that initializes a WordPiece piece encoder model. + +| Name | Description | +| ------ | ------------------------------------------------ | +| `path` | Path to the serialized WordPiece model. ~~Path~~ | + +## Callbacks + +### gradual_transformer_unfreezing.v1 {id="gradual_transformer_unfreezing",tag="registered_function"} + +Construct a callback that can be used to gradually unfreeze the weights of one +or more Transformer components during training. This can be used to prevent +catastrophic forgetting during fine-tuning. + +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `target_pipes` | A dictionary whose keys and values correspond to the names of Transformer components and the training step at which they should be unfrozen respectively. ~~Dict[str, int]~~ | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index d2f73d83a66..dd9a26af3d8 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -97,6 +97,7 @@ "items": [ { "text": "AttributeRuler", "url": "/api/attributeruler" }, { "text": "CoreferenceResolver", "url": "/api/coref" }, + { "text": "CuratedTransformer", "url": "/api/curatedtransformer" }, { "text": "DependencyParser", "url": "/api/dependencyparser" }, { "text": "EditTreeLemmatizer", "url": "/api/edittreelemmatizer" }, { "text": "EntityLinker", "url": "/api/entitylinker" }, From 36d4767aca313ff436d398787d8df09b58678b50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 13 Sep 2023 13:16:05 +0200 Subject: [PATCH 17/39] Skip project remotes test for python 3.12 (#12980) `weasel` (using `cloudpathlib`) does not currently support remote paths for python 3.12. 
--- spacy/tests/test_cli_app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 3a426113bae..a2fd4d666e6 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,4 +1,5 @@ import os +import sys from pathlib import Path import pytest @@ -213,6 +214,9 @@ def test_project_clone(options): assert (out / "README.md").is_file() +@pytest.mark.skipif( + sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes" +) def test_project_push_pull(project_dir): proj = dict(SAMPLE_PROJECT) remote = "xyz" From 4e3360ad12c924b37185c238e832c10ac3ad9e15 Mon Sep 17 00:00:00 2001 From: Eliana Vornov Date: Mon, 25 Sep 2023 05:25:41 -0400 Subject: [PATCH 18/39] add --spans-key option for CLI spancat evaluation (#12981) * add span key option for CLI evaluation * Rephrase CLI help to refer to Doc.spans instead of spancat * Rephrase docs to refer to Doc.spans instead of spancat --------- Co-authored-by: Adriane Boyd --- spacy/cli/evaluate.py | 2 ++ website/docs/api/cli.mdx | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 6235b658d22..2276ca6b0d4 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -28,6 +28,7 @@ def evaluate_cli( displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), + spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"), # fmt: on ): """ @@ -53,6 +54,7 @@ def evaluate_cli( displacy_limit=displacy_limit, per_component=per_component, silent=False, + spans_key=spans_key, ) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index d63ac6e1dd6..2646a848b8a 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1183,7 +1183,7 @@ skew. To render a sample of dependency parses in a HTML file using the `--displacy-path` argument. ```bash -$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] +$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key] ``` | Name | Description | @@ -1197,6 +1197,7 @@ $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [-- | `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | | `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | | `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | +| `--spans-key`, `-sk` | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Training results and optional metrics and visualizations. 
| From ed8c11e2aac43d0a378377823188e30c367391e4 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Mon, 25 Sep 2023 18:44:35 +0900 Subject: [PATCH 19/39] Fix typo in lemmatizer.py (#13003) specfic -> specific --- spacy/lang/es/lemmatizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py index 44f9683476b..ee5d38e8466 100644 --- a/spacy/lang/es/lemmatizer.py +++ b/spacy/lang/es/lemmatizer.py @@ -163,7 +163,7 @@ def lemmatize_det( for old, new in self.lookups.get_table("lemma_rules").get("det", []): if word == old: return [new] - # If none of the specfic rules apply, search in the common rules for + # If none of the specific rules apply, search in the common rules for # determiners and pronouns that follow a unique pattern for # lemmatization. If the word is in the list, return the corresponding # lemma. @@ -291,7 +291,7 @@ def lemmatize_pron( for old, new in self.lookups.get_table("lemma_rules").get("pron", []): if word == old: return [new] - # If none of the specfic rules apply, search in the common rules for + # If none of the specific rules apply, search in the common rules for # determiners and pronouns that follow a unique pattern for # lemmatization. If the word is in the list, return the corresponding # lemma. From 935a5455b696635119dc879205c92d442da67cb6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 Sep 2023 11:49:28 +0200 Subject: [PATCH 20/39] Docs: add new tag for evaluate CLI --spans-keys (#13013) --- website/docs/api/cli.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 2646a848b8a..e6b04a930dd 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1197,7 +1197,7 @@ $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [-- | `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | | `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | | `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | -| `--spans-key`, `-sk` | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~ | +| `--spans-key`, `-sk` 3.6.2 | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Training results and optional metrics and visualizations. 
| From ff4215f1c7c99c1728a5a77c8283b91f71804cd4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 Sep 2023 14:48:38 +0200 Subject: [PATCH 21/39] Drop support for python 3.6 (#13009) * Drop support for python 3.6 * Update docs --- .github/workflows/tests.yml | 3 --- README.md | 2 +- build-constraints.txt | 6 +++--- requirements.txt | 5 ++--- setup.cfg | 9 +++------ website/docs/usage/index.mdx | 2 +- 6 files changed, 10 insertions(+), 17 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2f74d887d3b..f68280be200 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -60,8 +60,6 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] python_version: ["3.11"] include: - - os: ubuntu-20.04 - python_version: "3.6" - os: windows-latest python_version: "3.7" - os: macos-latest @@ -95,7 +93,6 @@ jobs: - name: Run mypy run: | python -m mypy spacy - if: matrix.python_version != '3.6' - name: Delete source directory and .egg-info run: | diff --git a/README.md b/README.md index 59d3ee9ee2b..02c2e1baf63 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ For detailed installation instructions, see the - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio) -- **Python version**: Python 3.6+ (only 64 bit) +- **Python version**: Python 3.7+ (only 64 bit) - **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ diff --git a/build-constraints.txt b/build-constraints.txt index 5540d634d69..b1cf596ca7c 100644 --- a/build-constraints.txt +++ b/build-constraints.txt @@ -1,6 +1,6 @@ -# build version constraints for use with wheelwright + multibuild -numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' -numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' +# build version constraints for use with wheelwright +numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64' +numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' numpy>=1.25.0; python_version>='3.9' diff --git a/requirements.txt b/requirements.txt index b6cc542a567..f711d0012a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,12 +33,11 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" -types-dataclasses>=0.1.3; python_version < "3.7" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 black==22.3.0 -cython-lint>=0.15.0; python_version >= "3.7" +cython-lint>=0.15.0 isort>=5.0,<6.0 diff --git a/setup.cfg b/setup.cfg index 9a5388c8035..a6b60ba59fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,6 @@ classifiers = Operating System :: Microsoft :: Windows Programming Language :: Cython Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 @@ -31,15 +30,13 @@ project_urls = [options] zip_safe = false include_package_data = true -python_requires = >=3.6 +python_requires = >=3.7 # NOTE: This section is superseded by pyproject.toml and will be removed in # spaCy v4 setup_requires = cython>=0.25,<3.0 - # The newest supported pip for python 3.6 has bugs related to markers 
in - # this section, so this does not contain the same constraints as - # pyproject.toml - numpy>=1.15.0 + numpy>=1.15.0; python_version < "3.9" + numpy>=1.19.0; python_version >= "3.9" # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx index 414968d420b..c50e9db6c6b 100644 --- a/website/docs/usage/index.mdx +++ b/website/docs/usage/index.mdx @@ -20,7 +20,7 @@ menu: ## Installation instructions {id="installation"} -spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**, +spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are available over [pip](https://pypi.python.org/pypi/spacy) and [conda](https://anaconda.org/conda-forge/spacy). From b4501db6f8bff096f2a2103f998e81cd2dd1dfa2 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 25 Sep 2023 18:20:30 +0200 Subject: [PATCH 22/39] Update emoji library in rule-based matcher example (#13014) --- website/docs/usage/rule-based-matching.mdx | 58 +++++++++++----------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 4f54415cbdf..d01107ea243 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -850,14 +850,14 @@ negative pattern. To keep it simple, we'll either add or subtract `0.1` points this way, the score will also reflect combinations of emoji, even positive _and_ negative ones. -With a library like [Emojipedia](https://github.com/bcongdon/python-emojipedia), -we can also retrieve a short description for each emoji – for example, 😍's -official title is "Smiling Face With Heart-Eyes". Assigning it to a +With a library like [emoji](https://github.com/carpedm20/emoji), we can also +retrieve a short description for each emoji – for example, 😍's official title +is "Smiling Face With Heart-Eyes". Assigning it to a [custom attribute](/usage/processing-pipelines#custom-components-attributes) on the emoji span will make it available as `span._.emoji_desc`. ```python -from emojipedia import Emojipedia # Installation: pip install emojipedia +import emoji # Installation: pip install emoji from spacy.tokens import Span # Get the global Span object Span.set_extension("emoji_desc", default=None) # Register the custom attribute @@ -869,9 +869,9 @@ def label_sentiment(matcher, doc, i, matches): elif doc.vocab.strings[match_id] == "SAD": doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment span = doc[start:end] - emoji = Emojipedia.search(span[0].text) # Get data for emoji - span._.emoji_desc = emoji.title # Assign emoji description - + # Verify if it is an emoji and set the extension attribute correctly. + if emoji.is_emoji(span[0].text): + span._.emoji_desc = emoji.demojize(span[0].text, delimiters=("", ""), language=doc.lang_).replace("_", " ") ``` To label the hashtags, we can use a @@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. 
| -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. | -| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. | +| Symbol | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. 
`A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From 6255e386954778ea745aef4cf9ae3c3051a643bd Mon Sep 17 00:00:00 2001 From: Sergiu Nisioi Date: Thu, 28 Sep 2023 16:06:50 +0700 Subject: [PATCH 23/39] Adding rolegal model to the spaCy universe (#13017) * adding rolegal model to the spaCy universe * Fix formatting * Use raw URL * update image url and example * fix pip and update url to raw * okay, let's add thumb instead of image :octopus: * Update website/meta/universe.json --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 46de8121c54..b2868c08480 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4469,6 +4469,37 @@ }, "category": ["pipeline", "standalone"], "tags": ["spans", "rules", "ner"] + }, + { + "id": "rolegal", + "title": "A spaCy Package for Romanian Legal Document Processing", + "thumb": "https://raw.githubusercontent.com/senisioi/rolegal/main/img/paper200x200.jpeg", + "slogan": "rolegal: a spaCy Package for Noisy Romanian Legal Document Processing", + "description": "This is a spaCy language model for Romanian legal domain trained with floret 4-gram to 5-gram embeddings and `LEGAL` entity recognition. Useful for processing OCR-resulted noisy legal documents.", + "github": "senisioi/rolegal", + "pip": "ro-legal-fl", + "tags": ["legal", "floret", "ner", "romanian"], + "code_example": [ + "import spacy", + "nlp = spacy.load(\"ro_legal_fl\")", + "", + "doc = nlp(\"Titlul III din LEGEA nr. 255 din 19 iulie 2013, publicată în MONITORUL OFICIAL\")", + "# legal entity identification", + "for entity in doc.ents:", + " print('entity: ', entity, '; entity type: ', entity.label_)", + "", + "# floret n-gram embeddings robust to typos", + "print(nlp('achizit1e public@').similarity(nlp('achiziții publice')))", + "# 0.7393895566928835", + "print(nlp('achizitii publice').similarity(nlp('achiziții publice')))", + "# 0.8996480808279399" + ], + "author": "Sergiu Nisioi", + "author_links": { + "github": "senisioi", + "website": "https://nlp.unibuc.ro/people/snisioi.html" + }, + "category": ["pipeline", "training", "models"] } ], From beda27a91eadd70563dbaffd844d8c9d5e245928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 28 Sep 2023 11:36:44 +0200 Subject: [PATCH 24/39] Load the cli module lazily for spacy.info (#12962) * Load the cli module lazily for spacy.info This avoids that the `spacy` module cannot be imported when the users chooses not to install `typer`/`requests`. 
* Add test --------- Co-authored-by: Adriane Boyd --- spacy/__init__.py | 7 ++++++- spacy/tests/test_cli.py | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1a18ad0d580..8aa2eccd789 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,7 +13,6 @@ from . import pipeline # noqa: F401 from . import util from .about import __version__ # noqa: F401 -from .cli.info import info # noqa: F401 from .errors import Errors from .glossary import explain # noqa: F401 from .language import Language @@ -77,3 +76,9 @@ def blank( # We should accept both dot notation and nested dict here for consistency config = util.dot_to_dict(config) return LangClass.from_config(config, vocab=vocab, meta=meta) + + +def info(*args, **kwargs): + from .cli.info import info as cli_info + + return cli_info(*args, **kwargs) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8e1c9ca3215..ebf2ec7da2f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -14,6 +14,7 @@ import spacy from spacy import about +from spacy import info as spacy_info from spacy.cli import info from spacy.cli._util import ( download_file, @@ -225,6 +226,9 @@ def test_cli_info(): raw_data = info(tmp_dir, exclude=[""]) assert raw_data["lang"] == "nl" assert raw_data["components"] == ["textcat"] + raw_data = spacy_info(tmp_dir, exclude=[""]) + assert raw_data["lang"] == "nl" + assert raw_data["components"] == ["textcat"] def test_cli_converters_conllu_to_docs(): From 55614d6799682c9658b4249c21f47a2ae7c07fe8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 12 Sep 2023 08:49:41 +0200 Subject: [PATCH 25/39] Add profile=False to currently unprofiled cython --- spacy/attrs.pyx | 1 + spacy/lexeme.pyx | 1 + spacy/ml/parser_model.pyx | 1 + spacy/morphology.pyx | 1 + spacy/parts_of_speech.pyx | 2 +- spacy/pipeline/_edit_tree_internals/edit_trees.pyx | 1 + spacy/pipeline/_parser_internals/_state.pyx | 1 + spacy/pipeline/_parser_internals/ner.pyx | 1 + spacy/pipeline/_parser_internals/stateclass.pyx | 1 + spacy/pipeline/_parser_internals/transition_system.pyx | 1 + spacy/pipeline/transition_parser.pyx | 1 + spacy/strings.pyx | 1 + spacy/symbols.pyx | 1 + spacy/tokens/graph.pyx | 1 + spacy/tokens/morphanalysis.pyx | 1 + spacy/tokens/span.pyx | 1 + spacy/tokens/span_group.pyx | 1 + spacy/tokens/token.pyx | 1 + spacy/training/align.pyx | 1 + spacy/training/alignment_array.pyx | 1 + spacy/training/example.pyx | 1 + spacy/training/gold_io.pyx | 1 + spacy/typedefs.pyx | 1 + 23 files changed, 23 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 97b5d5e3627..363dd094dcd 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,4 @@ +# cython: profile=False from .errors import Errors IOB_STRINGS = ("", "I", "O", "B") diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 60d22e615ca..f803d5e9394 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,4 +1,5 @@ # cython: embedsignature=True +# cython: profile=False # Compiler crashes on memory view coercion without this. Should report bug. 
cimport numpy as np from libc.string cimport memset diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index ae60972aaff..f004c562e7d 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, cdivision=True, boundscheck=False +# cython: profile=False cimport numpy as np from libc.math cimport exp from libc.stdlib cimport calloc, free, realloc diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ecbbed729b1..cef45b04d14 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,4 +1,5 @@ # cython: infer_types +# cython: profile=False import warnings import numpy diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index e71fb917ffb..98e3570ec64 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,4 +1,4 @@ - +# cython: profile=False IDS = { "": NO_TAG, "ADJ": ADJ, diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx index 78cd25622ea..7abd9f2a6f4 100644 --- a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx +++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, binding=True +# cython: profile=False from cython.operator cimport dereference as deref from libc.stdint cimport UINT32_MAX, uint32_t from libc.string cimport memset diff --git a/spacy/pipeline/_parser_internals/_state.pyx b/spacy/pipeline/_parser_internals/_state.pyx index e69de29bb2d..61bf6203857 100644 --- a/spacy/pipeline/_parser_internals/_state.pyx +++ b/spacy/pipeline/_parser_internals/_state.pyx @@ -0,0 +1 @@ +# cython: profile=False diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index 6c4f8e245af..e4312bd2f92 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,3 +1,4 @@ +# cython: profile=False from cymem.cymem cimport Pool from libc.stdint cimport int32_t diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index fdb5004bb4a..e3b063b7d0d 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=False from libcpp.vector cimport vector from ...tokens.doc cimport Doc diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index aabbdfa2409..e035053b314 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=False from __future__ import print_function from cymem.cymem cimport Pool diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 11c8fafc708..9a278fc1328 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True +# cython: profile=False from __future__ import print_function cimport numpy as np diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b0799d6fcc7..376a131751e 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=False cimport cython from libc.stdint cimport uint32_t from libc.string cimport memcpy diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 
d1deeb0e784..f7713577bd3 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,4 +1,5 @@ # cython: optimize.unpack_method_calls=False +# cython: profile=False IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx index 1cbec09f4e5..6c4ce6ce358 100644 --- a/spacy/tokens/graph.pyx +++ b/spacy/tokens/graph.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True +# cython: profile=False from typing import Generator, List, Tuple cimport cython diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index ba7c638f66a..ea5d07fa449 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,3 +1,4 @@ +# cython: profile=False cimport numpy as np from libc.string cimport memset diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cf90e416b22..af3ba8db5ef 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,3 +1,4 @@ +# cython: profile=False cimport numpy as np import copy diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx index d245a142508..257c907bcce 100644 --- a/spacy/tokens/span_group.pyx +++ b/spacy/tokens/span_group.pyx @@ -1,3 +1,4 @@ +# cython: profile=False import struct import weakref from copy import deepcopy diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index de967ba25c1..9fd4118d67b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=False # Compiler crashes on memory view coercion without this. Should report bug. cimport numpy as np diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx index 79fec73c411..c68110e304f 100644 --- a/spacy/training/align.pyx +++ b/spacy/training/align.pyx @@ -1,3 +1,4 @@ +# cython: profile=False import re from itertools import chain from typing import List, Tuple diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx index b0be1512b1a..f0eb5cf39c3 100644 --- a/spacy/training/alignment_array.pyx +++ b/spacy/training/alignment_array.pyx @@ -1,3 +1,4 @@ +# cython: profile=False from typing import List import numpy diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 3f0cf5adee4..abdcecf71d1 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -1,3 +1,4 @@ +# cython: profile=False from collections.abc import Iterable as IterableInstance import numpy diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 2fc36e41ff6..afbdf463110 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -1,3 +1,4 @@ +# cython: profile=False import warnings import srsly diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx index e69de29bb2d..61bf6203857 100644 --- a/spacy/typedefs.pyx +++ b/spacy/typedefs.pyx @@ -0,0 +1 @@ +# cython: profile=False From 538304948e6ec9a92411a9c1b0386012cf4dafc3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 12 Sep 2023 08:50:01 +0200 Subject: [PATCH 26/39] Remove profile=True from currently profiled cython --- spacy/kb/candidate.pyx | 2 +- spacy/kb/kb.pyx | 2 +- spacy/kb/kb_in_memory.pyx | 2 +- spacy/matcher/dependencymatcher.pyx | 2 +- spacy/matcher/levenshtein.pyx | 2 +- spacy/matcher/matcher.pyx | 2 +- spacy/matcher/phrasematcher.pyx | 2 +- spacy/pipeline/_parser_internals/_beam_utils.pyx | 1 - spacy/pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/pipeline/_parser_internals/nonproj.pyx | 2 +- spacy/pipeline/dep_parser.pyx | 2 +- 
spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/multitask.pyx | 2 +- spacy/pipeline/ner.pyx | 2 +- spacy/pipeline/pipe.pyx | 2 +- spacy/pipeline/sentencizer.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 2 +- spacy/pipeline/trainable_pipe.pyx | 2 +- spacy/tokenizer.pyx | 2 +- spacy/tokens/_retokenize.pyx | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/vectors.pyx | 2 +- spacy/vocab.pyx | 1 - 24 files changed, 22 insertions(+), 24 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 53fc9b036d2..4369676e23a 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: infer_types=True from typing import Iterable diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 6ad4c3564c2..c7db34e166a 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: infer_types=True from pathlib import Path from typing import Iterable, Tuple, Union diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 02773cbae0a..2b21f246a54 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: infer_types=True from typing import Any, Callable, Dict, Iterable import srsly diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 1f66d99b222..ab5f5d5d14b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: infer_types=True import warnings from collections import defaultdict from itertools import product diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx index e823ce99d4b..e394f2cf4a2 100644 --- a/spacy/matcher/levenshtein.pyx +++ b/spacy/matcher/levenshtein.pyx @@ -1,4 +1,4 @@ -# cython: profile=True, binding=True, infer_types=True +# cython: binding=True, infer_types=True from cpython.object cimport PyObject from libc.stdint cimport int64_t diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 167f85af491..9a9ed421223 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,4 +1,4 @@ -# cython: binding=True, infer_types=True, profile=True +# cython: binding=True, infer_types=True from typing import Iterable, List from cymem.cymem cimport Pool diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 26633e6d616..4efcdb05c43 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True +# cython: infer_types=True from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set import warnings diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx index de8f0bf7b87..ac04be5a719 100644 --- a/spacy/pipeline/_parser_internals/_beam_utils.pyx +++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx @@ -1,5 +1,4 @@ # cython: infer_types=True -# cython: profile=True import numpy from thinc.extra.search cimport Beam diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index bcb4626fba8..e1375494482 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -1,4 +1,4 @@ -# cython: profile=True, cdivision=True, infer_types=True +# cython: cdivision=True, infer_types=True 
from cymem.cymem cimport Address, Pool from libc.stdint cimport int32_t from libcpp.vector cimport vector diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 93ad14feb3d..7de19851e00 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -1,4 +1,4 @@ -# cython: profile=True, infer_types=True +# cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 57f09178802..18a220bd631 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from collections import defaultdict from typing import Callable, Optional diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7ca3908bd4d..d415ae43c5c 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from itertools import islice from typing import Callable, Dict, Optional, Union diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 2a62a50d5a9..f33a90fde85 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from typing import Optional import numpy diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 15c092ae9c5..bb009dc7a6a 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from collections import defaultdict from typing import Callable, Optional diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 90775c465cf..72ea7e45a80 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True import warnings from typing import Callable, Dict, Iterable, Iterator, Tuple, Union diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 76f2966440f..08ba9d989c1 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from typing import Callable, List, Optional import srsly diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 37ddcc3c03c..df093baa9c6 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from itertools import islice from typing import Callable, Optional diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 4c5265a785b..34e85d49c2b 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from itertools import islice from typing import Callable, Optional diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 
e5865e07070..8f219b32797 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple import srsly diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 8fc95bea08e..a239eaf456f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,4 +1,4 @@ -# cython: embedsignature=True, profile=True, binding=True +# cython: embedsignature=True, binding=True cimport cython from cymem.cymem cimport Pool from cython.operator cimport dereference as deref diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index f28d2e0883f..b0e4ff85c9f 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, bounds_check=False, profile=True +# cython: infer_types=True, bounds_check=False from cymem.cymem cimport Pool from libc.string cimport memset diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8fc2c4b3cbd..745eb5ff321 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, bounds_check=False, profile=True +# cython: infer_types=True, bounds_check=False from typing import Set cimport cython diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 2817bcad42a..6ff99bb59eb 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from typing import Callable from cython.operator cimport dereference as deref diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 48e8fcb9087..4004a70e034 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,4 +1,3 @@ -# cython: profile=True import functools import numpy From 1adf79414e14f8f45a64c7cdb6cb098b4cf1f46f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 12 Sep 2023 08:52:15 +0200 Subject: [PATCH 27/39] Set cython profiling default to True for <3.12, False for >=3.12 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3b6fae37b8b..33178662df4 100755 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ "language_level": -3, "embedsignature": True, "annotation_typing": False, + "profile": sys.version_info < (3, 12), } # Files to copy into the package that are otherwise not included COPY_FILES = { From 76d94b31f239f419fabfe6fd27bb039175a6bee5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 16:58:33 +0200 Subject: [PATCH 28/39] Branch on python 3.12+ shutil.rmtree in make_tempdir --- spacy/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index c5c57d67d9b..8464e411f85 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1068,7 +1068,10 @@ def force_remove(rmfunc, path, ex): rmfunc(path) try: - shutil.rmtree(str(d), onerror=force_remove) + if sys.version_info >= (3, 12): + shutil.rmtree(str(d), onexc=force_remove) + else: + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) From b4990395f9bff384b5617ee8ad861a1e0f71cf01 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 17:13:49 +0200 Subject: [PATCH 29/39] Update mypy requirements --- .github/workflows/tests.yml | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml 
index f68280be200..a42803a6168 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -93,6 +93,7 @@ jobs: - name: Run mypy run: | python -m mypy spacy + if: matrix.python_version != '3.7' - name: Delete source directory and .egg-info run: | diff --git a/requirements.txt b/requirements.txt index 48d188ec9d3..a8ba956a176 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,7 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<1.1.0; platform_machine != "aarch64" +mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests From 467c82439e20a7e0b54cdce1fff6ceb63a237c63 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 18:20:22 +0200 Subject: [PATCH 30/39] Always use tqdm with `disable=None` `tqdm` can cause deadlocks in the test suite if enabled. --- spacy/cli/apply.py | 4 +++- spacy/cli/benchmark_speed.py | 2 +- spacy/cli/profile.py | 2 +- spacy/training/initialize.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py index 8c4b4c8bfe2..ffd8105060a 100644 --- a/spacy/cli/apply.py +++ b/spacy/cli/apply.py @@ -133,7 +133,9 @@ def apply( if len(text_files) > 0: streams.append(_stream_texts(text_files)) datagen = cast(DocOrStrStream, chain(*streams)) - for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)): + for doc in tqdm.tqdm( + nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None + ): docbin.add(doc) if output_file.suffix == "": output_file = output_file.with_suffix(".spacy") diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py index a683d159126..c7fd771c39c 100644 --- a/spacy/cli/benchmark_speed.py +++ b/spacy/cli/benchmark_speed.py @@ -89,7 +89,7 @@ def __init__(self, sample: numpy.ndarray) -> None: def annotate( nlp: Language, docs: List[Doc], batch_size: Optional[int] ) -> numpy.ndarray: - docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size) + docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size) wps = [] while True: with time_context() as elapsed: diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index e1f720327cb..e5b8f11939f 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> def parse_texts(nlp: Language, texts: Sequence[str]) -> None: - for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): + for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16): pass diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 82d4ebf24b1..0621702214c 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -302,7 +302,7 @@ def read_vectors( shape = (truncate_vectors, shape[1]) vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] - for i, line in enumerate(tqdm.tqdm(f)): + for i, line in enumerate(tqdm.tqdm(f, disable=None)): line = line.rstrip() pieces = line.rsplit(" ", vectors_data.shape[1]) word = pieces.pop(0) From 78504c25a516eace9702c8f1bf9dafb82b6b1b2b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 11:54:15 +0200 Subject: [PATCH 31/39] CI: Add python 3.12.0rc2 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a42803a6168..1058b4673b6 
100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,7 +58,7 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python_version: ["3.11"] + python_version: ["3.11", "3.12.0-rc.2"] include: - os: windows-latest python_version: "3.7" From 6b4f774418d2ac771658bc0122a3e97e1fce9085 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 28 Sep 2023 21:27:42 +0200 Subject: [PATCH 32/39] Set version to v3.7.0 (#13028) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index d816926fdd9..1a3367673f6 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.0.dev0" +__version__ = "3.7.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 483d4a5bc0762f8d942f20b5ae58010ce73cb423 Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Thu, 28 Sep 2023 23:22:56 -0700 Subject: [PATCH 33/39] Allow spacy-transformers v1.3.x in transformers extra (#13025) Co-authored-by: Adriane Boyd --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 852ff4049f5..75f2e3a15f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -78,7 +78,7 @@ console_scripts = lookups = spacy_lookups_data>=1.0.3,<1.1.0 transformers = - spacy_transformers>=1.1.2,<1.3.0 + spacy_transformers>=1.1.2,<1.4.0 cuda = cupy>=5.0.0b4,<13.0.0 cuda80 = From 1b043dde3fc674869f11b8b138db878552b4c91a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 9 Aug 2023 13:43:50 +0200 Subject: [PATCH 34/39] Revert "disable tests until 3.7 models are available" This reverts commit 991bcc111e1a35cc96dba32ac08c212b0b360384. 
--- .github/workflows/tests.yml | 54 ++++++++++++++++++------------------- spacy/tests/test_cli.py | 2 -- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1058b4673b6..976b1f4f212 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -115,22 +115,22 @@ jobs: - name: Test import run: python -W error -c "import spacy" -# - name: "Test download CLI" -# run: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# if: matrix.python_version == '3.9' -# -# - name: "Test download_url in info CLI" -# run: | -# python -W error -m spacy info ca_core_news_sm | grep -q download_url -# if: matrix.python_version == '3.9' -# -# - name: "Test no warnings on load (#11713)" -# run: | -# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" -# if: matrix.python_version == '3.9' + - name: "Test download CLI" + run: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + if: matrix.python_version == '3.9' + + - name: "Test download_url in info CLI" + run: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + if: matrix.python_version == '3.9' + + - name: "Test no warnings on load (#11713)" + run: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | @@ -154,17 +154,17 @@ jobs: python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 if: matrix.python_version == '3.9' -# - name: "Test assemble CLI" -# run: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# if: matrix.python_version == '3.9' -# -# - name: "Test assemble CLI vectors warning" -# run: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# if: matrix.python_version == '3.9' + - name: "Test assemble CLI" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + if: matrix.python_version == '3.9' + + - name: "Test assemble CLI vectors warning" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + if: matrix.python_version == '3.9' - name: "Install test requirements" run: | diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8c1d1a64caf..0d2fe0a9ef3 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -538,7 +538,6 @@ def test_string_to_list_intify(value): assert 
string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -549,7 +548,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 160e61772e3e4fbd4e9e28446c6d687596921f93 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 1 Oct 2023 21:40:07 +0200 Subject: [PATCH 35/39] Docs for v3.7.0 (#13029) * Docs for v3.7.0 * Minor fixes * Extend Weasel notes * Minor edits * Update version in README --- README.md | 2 +- website/docs/usage/v3-7.mdx | 140 +++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 3 +- website/src/templates/index.js | 4 +- 4 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 website/docs/usage/v3-7.mdx diff --git a/README.md b/README.md index 3920c1dc229..b2ffa463938 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). -💫 **Version 3.6 out now!** +💫 **Version 3.7 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![tests](https://github.com/explosion/spaCy/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spaCy/actions/workflows/tests.yml) diff --git a/website/docs/usage/v3-7.mdx b/website/docs/usage/v3-7.mdx new file mode 100644 index 00000000000..76fc9530ffd --- /dev/null +++ b/website/docs/usage/v3-7.mdx @@ -0,0 +1,140 @@ +--- +title: What's New in v3.7 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New features {id="features",hidden="true"} + +spaCy v3.7 adds support for Python 3.12, introduces the new standalone library +[Weasel](https://github.com/explosion/weasel) for project workflows, and updates +the transformer-based trained pipelines to use our new +[Curated Transformers](https://github.com/explosion/curated-transformers) +library. + +This release drops support for Python 3.6. + +### Weasel {id="weasel"} + +The [spaCy projects](/usage/projects) functionality has been moved into a new +standalone library [Weasel](https://github.com/explosion/weasel). This brings +minor changes to spaCy-specific settings in spaCy projects (see +[upgrading](#upgrading) below), but also makes it possible to use the same +workflow functionality outside of spaCy. + +All `spacy project` commands should run as before, just now they're using Weasel +under the hood. + + + +Remote storage for spaCy projects is not yet supported for Python 3.12. Use +Python 3.11 or earlier for remote storage. + + + +### Registered vectors {id="custom-vectors"} + +You can specify a custom registered vectors class under `[nlp.vectors]` in order +to use static vectors in formats other than the ones supported by +[`Vectors`](/api/vectors). To implement your custom vectors, extend the abstract +class [`BaseVectors`](/api/basevectors). See an example using +[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors). 
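The `[nlp.vectors]` hook described above is easiest to see in code. The sketch below is illustrative only: the registry entry name `MyVectors.v1` is made up, and the nested factory shape (a registered function returning a `create_vectors(vocab)` callable) is an assumption modeled on the default `spacy.Vectors.v1` entry rather than a verbatim excerpt of the spaCy API, so check the `BaseVectors` documentation before relying on it.

```python
# Sketch: a registered vectors factory that [nlp.vectors] could point to.
# Entry name and factory signature are assumptions, not the official API.
from spacy.util import registry
from spacy.vectors import Vectors


@registry.vectors("MyVectors.v1")  # hypothetical registry entry
def create_my_vectors():
    def create_vectors(vocab):
        # A real implementation would return a BaseVectors subclass that reads
        # an external format (e.g. BPEmb); this sketch returns plain Vectors.
        return Vectors(strings=vocab.strings)

    return create_vectors
```

With that in place, the config would reference `@vectors = "MyVectors.v1"` under `[nlp.vectors]` instead of the default `spacy.Vectors.v1`.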
+ + ### Additional features and improvements {id="additional-features-and-improvements"} + +- Add support for Python 3.12. +- Extend to Thinc v8.2. +- Extend `transformers` extra to `spacy-transformers` v1.3. +- Add `--spans-key` option for CLI evaluation with `spacy benchmark accuracy`. +- Load the CLI module lazily for `spacy.info`. +- Add type stubs for `spacy.training.example`. +- Warn for unsupported pattern keys in dependency matcher. +- `Language.replace_listeners`: Pass the replaced listener and the `tok2vec` + pipe to the callback in order to support `spacy-curated-transformers`. +- Always use `tqdm` with `disable=None` in order to disable output in + non-interactive environments. +- Language updates: + - Add left and right pointing angle brackets as punctuation to ancient Greek. + - Update example sentences for Turkish. +- Package setup updates: + - Update NumPy build constraints for NumPy 1.25+. For Python 3.9+, it is no + longer necessary to set build constraints while building binary wheels. + - Refactor Cython profiling in order to disable profiling for Python 3.12 in + the package setup, since Cython does not currently support profiling for + Python 3.12. + +## Trained pipelines {id="pipelines"} + +### Pipeline updates {id="pipeline-updates"} + +The transformer-based `trf` pipelines have been updated to use our new +[Curated Transformers](https://github.com/explosion/curated-transformers) +library using the Thinc model wrappers and pipeline component from +[spaCy Curated Transformers](https://github.com/explosion/spacy-curated-transformers). + +## Notes about upgrading from v3.6 {id="upgrading"} + +This release drops support for Python 3.6, drops mypy checks for Python 3.7 and +removes the `ray` extra. In addition there are several minor changes for spaCy +projects described in the following section. + +### Backwards incompatibilities for spaCy Projects {id="upgrading-projects"} + +`spacy project` has a few backwards incompatibilities due to the transition to +the standalone library [Weasel](https://github.com/explosion/weasel), which is +not as tightly coupled to spaCy. Weasel produces warnings when it detects older +spaCy-specific settings in your environment or project config. + +- Support for the `spacy_version` configuration key has been dropped. +- Support for the `check_requirements` configuration key has been dropped due to + the deprecation of `pkg_resources`. +- The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked. You + can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`. +- Support for the `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been + dropped. +- Error codes are now Weasel-specific and do not follow spaCy error codes. + +### Pipeline package version compatibility {id="version-compat"} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with an earlier version of spaCy +v3, you will see a warning telling you that the pipeline may be incompatible. +This doesn't necessarily have to be true, but we recommend running your +pipelines against your test suite or evaluation data to make sure there are no +unexpected results.
+ +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.6.0,<3.7.0", ++ "spacy_version": ">=3.6.0,<3.8.0", +``` + +### Updating v3.6 configs + +To update a config from spaCy v3.6 with the new v3.7 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.6.cfg config-v3.7.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 617473cb0a8..24213ed125b 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -15,7 +15,8 @@ { "text": "New in v3.3", "url": "/usage/v3-3" }, { "text": "New in v3.4", "url": "/usage/v3-4" }, { "text": "New in v3.5", "url": "/usage/v3-5" }, - { "text": "New in v3.6", "url": "/usage/v3-6" } + { "text": "New in v3.6", "url": "/usage/v3-6" }, + { "text": "New in v3.7", "url": "/usage/v3-7" } ] }, { diff --git a/website/src/templates/index.js b/website/src/templates/index.js index c8295593cfd..1c969bd3984 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 Out now: spaCy v3.6 + + 💥 Out now: spaCy v3.7 ) From 92ce32aa3f04b2d7fac2db0b5bfe3411c8709d9e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Oct 2023 12:53:46 +0200 Subject: [PATCH 36/39] Update binder version to v3.7 (#13034) --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 08fcde62e5f..a07d131d3be 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -27,7 +27,7 @@ "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", - "binderVersion": "3.6", + "binderVersion": "3.7", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 6d0185f7fba4d8a4f76a9c35d2e78542ee0c226a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Oct 2023 12:33:33 +0200 Subject: [PATCH 37/39] Revert "Load the cli module lazily for spacy.info (#12962)" This reverts commit beda27a91eadd70563dbaffd844d8c9d5e245928. --- spacy/__init__.py | 7 +------ spacy/tests/test_cli.py | 4 ---- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8aa2eccd789..1a18ad0d580 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,6 +13,7 @@ from . import pipeline # noqa: F401 from . 
import util from .about import __version__ # noqa: F401 +from .cli.info import info # noqa: F401 from .errors import Errors from .glossary import explain # noqa: F401 from .language import Language @@ -76,9 +77,3 @@ def blank( # We should accept both dot notation and nested dict here for consistency config = util.dot_to_dict(config) return LangClass.from_config(config, vocab=vocab, meta=meta) - - -def info(*args, **kwargs): - from .cli.info import info as cli_info - - return cli_info(*args, **kwargs) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0d2fe0a9ef3..86451317b2b 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,6 @@ import spacy from spacy import about -from spacy import info as spacy_info from spacy.cli import info from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply @@ -193,9 +192,6 @@ def test_cli_info(): raw_data = info(tmp_dir, exclude=[""]) assert raw_data["lang"] == "nl" assert raw_data["components"] == ["textcat"] - raw_data = spacy_info(tmp_dir, exclude=[""]) - assert raw_data["lang"] == "nl" - assert raw_data["components"] == ["textcat"] def test_cli_converters_conllu_to_docs(): From 9d036607f1ad60ebf1719526c0ec1f531eb688e9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Oct 2023 18:13:12 +0200 Subject: [PATCH 38/39] Set version to v3.7.1 (#13042) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 1a3367673f6..0e718400b8b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.0" +__version__ = "3.7.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6e54360a3d068c2b85b45902f8885b8db043372f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 5 Oct 2023 08:50:22 +0200 Subject: [PATCH 39/39] Remove pathy dependency, update docs for cloudpathlib in Weasel (#13035) --- requirements.txt | 1 - setup.cfg | 1 - spacy/cli/_util.py | 4 ---- website/docs/api/cli.mdx | 6 +++--- website/docs/usage/projects.mdx | 6 +++--- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8ba956a176..3050624f99a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 -pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 weasel>=0.1.0,<0.4.0 # Third party dependencies diff --git a/setup.cfg b/setup.cfg index 75f2e3a15f3..ab9e39e0cd6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,6 @@ install_requires = weasel>=0.1.0,<0.4.0 # Third-party dependencies typer>=0.3.0,<0.10.0 - pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index bc6c53cd96c..fa41e6a08e0 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -41,10 +41,6 @@ run_command, ) -if TYPE_CHECKING: - from pathy import FluidPath # noqa: F401 - - SDIST_SUFFIX = ".tar.gz" WHEEL_SUFFIX = "-py3-none-any.whl" diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 3ec0081c9d0..51cae960be2 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1544,9 +1544,9 @@ obsolete files is left up to you. 
Remotes can be defined in the `remotes` section of the [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses -[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the -remote storages, so you can use any protocol that `Pathy` supports, including -[S3](https://aws.amazon.com/s3/), +[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the +remote storages, so you can use any protocol that `cloudpathlib` supports, +including [S3](https://aws.amazon.com/s3/), [Google Cloud Storage](https://cloud.google.com/storage), and the local filesystem, although you may need to install extra dependencies to use certain protocols. diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index f3cca8013f1..b089a7ab561 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -656,9 +656,9 @@ locally. You can list one or more remotes in the `remotes` section of your [`project.yml`](#project-yml) by mapping a string name to the URL of the storage. Under the hood, spaCy uses -[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the -remote storages, so you can use any protocol that `Pathy` supports, including -[S3](https://aws.amazon.com/s3/), +[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the +remote storages, so you can use any protocol that `cloudpathlib` supports, +including [S3](https://aws.amazon.com/s3/), [Google Cloud Storage](https://cloud.google.com/storage), and the local filesystem, although you may need to install extra dependencies to use certain protocols.
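To make the cloudpathlib point above concrete, here is a small illustrative check that a prospective remote URL is reachable with the same library Weasel uses under the hood. The bucket name is made up, and you would need the matching extra (for example `cloudpathlib[s3]`) plus credentials configured for the call to succeed.

```python
# Illustrative only: poke at a would-be `remotes` URL from project.yml using
# cloudpathlib directly. The bucket below is hypothetical.
from cloudpathlib import AnyPath

remote = AnyPath("s3://my-spacy-project-storage/experiments")
print(remote.exists())        # False until something has been pushed there
print(remote / "model-best")  # cloud paths compose like pathlib.Path objects
```

The same snippet works for a Google Cloud Storage URL (`gs://...`) or a plain local path, which makes the protocol list in the paragraph above mostly a question of which cloudpathlib extras are installed.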