diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 57285a44a..8d4f6f712 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,3 +58,20 @@ jobs: - uses: actions/checkout@v5 - uses: pre-commit/action@v3.0.1 + + type_completeness: + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v5 + + - name: Install project dependencies + uses: ./.github/setup + with: + # This is quite slow (2-3 minutes) so we don't run it for all OSs / Python versions. + os: ubuntu-latest + python-version: 3.14 + + - name: Verify type completeness using Pyright + run: poetry run poe type_completeness diff --git a/docs/tests.md b/docs/tests.md index 326f07e48..9b76508f6 100644 --- a/docs/tests.md +++ b/docs/tests.md @@ -11,8 +11,9 @@ Here are the most important options. Fore more details, please use `poe --help`. - Run only pytest: `poe pytest` - Run only pre-commit: `poe style` - Run tests against the installed stubs (this will install and uninstall the stubs): `poe test_dist` +- Verify type completeness: `poe type_completeness`. -These tests originally came from https://github.com/VirtusLab/pandas-stubs. +Some of these tests originally came from https://github.com/VirtusLab/pandas-stubs. The following tests are **optional**. Some of them are run by the CI but it is okay if they fail. 
diff --git a/pandas-stubs/core/generic.pyi b/pandas-stubs/core/generic.pyi
index 5289c7d36..c2a025405 100644
--- a/pandas-stubs/core/generic.pyi
+++ b/pandas-stubs/core/generic.pyi
@@ -26,7 +26,7 @@ import numpy as np
 from pandas import Index
 from pandas.core.resample import DatetimeIndexResampler
 from pandas.core.series import Series
-import sqlalchemy.engine
+from sqlalchemy.engine import Connectable
 
 from pandas._libs.lib import NoDefaultDoNotUse
 from pandas._typing import (
@@ -168,7 +168,7 @@ class NDFrame:
     def to_sql(
         self,
         name: _str,
-        con: str | sqlalchemy.engine.Connectable | sqlite3.Connection,
+        con: str | Connectable | sqlite3.Connection,
         *,
         schema: _str | None = None,
         if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
diff --git a/pandas-stubs/io/excel/_base.pyi b/pandas-stubs/io/excel/_base.pyi
index e91f58e30..8de3d8262 100644
--- a/pandas-stubs/io/excel/_base.pyi
+++ b/pandas-stubs/io/excel/_base.pyi
@@ -297,8 +297,6 @@ class ExcelWriter(Generic[_WorkbookT]):
     def close(self) -> None: ...
 
 class ExcelFile:
-    engine = ...
-    io: FilePath | ReadBuffer[bytes] | bytes = ...
     def __init__(
         self,
         path_or_buffer: FilePath | ReadBuffer[bytes] | bytes,
diff --git a/pandas-stubs/io/sql.pyi b/pandas-stubs/io/sql.pyi
index 7e9089b19..ea5e18d63 100644
--- a/pandas-stubs/io/sql.pyi
+++ b/pandas-stubs/io/sql.pyi
@@ -14,9 +14,14 @@ from typing import (
 )
 
 from pandas.core.frame import DataFrame
-import sqlalchemy.engine
+from sqlalchemy.engine import Connectable
 from sqlalchemy.orm import FromStatement
-import sqlalchemy.sql.expression
+from sqlalchemy.sql import Select
+from sqlalchemy.sql.expression import (
+    Selectable,
+    TextClause,
+    UpdateBase,
+)
 
 from pandas._libs.lib import NoDefaultDoNotUse
 from pandas._typing import (
@@ -27,15 +32,10 @@ from pandas._typing import (
     np_ndarray,
 )
 
-_SQLConnection: TypeAlias = str | sqlalchemy.engine.Connectable | sqlite3.Connection
+_SQLConnection: TypeAlias = str | Connectable | sqlite3.Connection
 
 _SQLStatement: TypeAlias = (
-    str
-    | sqlalchemy.sql.expression.Selectable
-    | sqlalchemy.sql.expression.TextClause
-    | sqlalchemy.sql.Select[Any]
-    | FromStatement[Any]
-    | sqlalchemy.sql.expression.UpdateBase
+    str | Selectable | TextClause | Select[Any] | FromStatement[Any] | UpdateBase
 )
 
 @overload
diff --git a/pyproject.toml b/pyproject.toml
index ed945d13c..fe66428b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -135,6 +135,9 @@ args = [
   { name = "nightly", positional = false, default = false, type = "boolean", required = false, help = "Compare against pandas nightly (off by default)" },
 ]
 
+[tool.poe.tasks.type_completeness]
+help = "Check type completeness"
+script = "scripts.test.run:type_completeness"
 
 [tool.black]
 target-version = ["py311"]
diff --git a/scripts/test/run.py b/scripts/test/run.py
index b74125d24..345bd62f7 100644
--- a/scripts/test/run.py
+++ b/scripts/test/run.py
@@ -161,3 +161,8 @@ def ty() -> None:
 def pyrefly() -> None:
     cmd = ["pyrefly", "check", "pandas-stubs"]
     subprocess.run(cmd, check=True)
+
+
+def type_completeness() -> None:
+    cmd = ["python", "-m", "scripts.type_completeness"]
+    subprocess.run(cmd, check=True)
diff --git a/scripts/type_completeness.py b/scripts/type_completeness.py
new file mode 100644
index 000000000..7e9046840
--- /dev/null
+++ b/scripts/type_completeness.py
@@ -0,0 +1,244 @@
+"""Ensure that pandas' public API is type-complete, using Pyright.
+
+We run Pyright's `--verifytypes` to ensure that type-completeness is at 100%.
+
+Rather than running the command as-is, we need to make some adjustments:
+
+- Use `--ignoreexternal` to ignore untyped symbols in dependent libraries:
+  https://github.com/microsoft/pyright/discussions/9911#discussioncomment-12192388.
+- We exclude symbols which are technically public (according to Pyright) but which
+  aren't in pandas' documented API and not considered public by pandas. There is no
+  CLI flag for this in Pyright, but we can parse the output json and exclude paths ourselves:
+  https://github.com/microsoft/pyright/discussions/10614#discussioncomment-13543475.
+- We create a temporary virtual environment with pandas installed in it, as Pyright
+  needs that to run its `--verifytypes` command.
+"""
+
+from __future__ import annotations
+
+from fnmatch import fnmatch
+import json
+import os
+from pathlib import Path
+import shutil
+import subprocess
+import sys
+import tempfile
+from typing import Any
+
+EXCLUDE = [
+    # pandas distributes (untyped) tests with the package
+    "*.tests.*",
+    "*.conftest.*",
+    # pandas.core is technically private, and anything considered public
+    # is re-exported in other places. For example, `DataFrameGroupBy` is
+    # re-exported in `pandas.api.typing`. The re-exports are available
+    # under `'alternateNames'`, which we consider when excluding symbols.
+    "pandas.core.*",
+    # Not considered public
+    # https://github.com/pandas-dev/pandas/blob/e87248e1a5d6d78a138039f2856a3aec6b9fef54/doc/source/reference/index.rst#L34
+    "pandas.compat.*",
+    # The only parts of `pandas.io` which appear in the API reference are:
+    # - `pandas.io.json`
+    # - `pandas.io.formats.style`
+    # https://github.com/pandas-dev/pandas/blob/b8371f5e6f329bfe1b5f1e099e221c8219fc6bbd/doc/source/reference/io.rst
+    # See also: https://github.com/pandas-dev/pandas/issues/27522#issuecomment-516360201
+    "pandas.io.common.*",
+    "pandas.io.parsers.*",
+    "pandas.io.excel.*",
+    "pandas.io.formats.csvs.*",
+    "pandas.io.formats.excel.*",
+    "pandas.io.formats.html.*",
+    "pandas.io.formats.info.*",
+    "pandas.io.formats.printing.*",
+    "pandas.io.formats.string.*",
+    "pandas.io.formats.xml.*",
+    # Not documented, not really part of public API
+    "pandas.api.executors.BaseExecutionEngine",
+]
+# Minimum fraction (0-1) of checked symbols whose types must be known.
+THRESHOLD = 1
+# Repository root; this file lives in `<root>/scripts/`.
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+# Directory, relative to the repository root, holding the stub files.
+_STUBS_DIR = "pandas-stubs"
+
+
+def _venv_bin_dir(venv_dir: Path) -> Path:
+    """Return the directory holding the executables of the venv at `venv_dir`."""
+    return venv_dir / ("Scripts" if sys.platform == "win32" else "bin")
+
+
+def venv_site_packages(venv_python: str) -> Path:
+    """Return the site-packages directory for a given venv Python executable."""
+    cmd = [
+        venv_python,
+        "-c",
+        "import sysconfig; print(sysconfig.get_paths()['purelib'])",
+    ]
+    out = subprocess.check_output(cmd, text=True).strip()
+    return Path(out)
+
+
+def run_pyright(venv_path: str) -> dict[str, Any]:
+    """Run `pyright --verifytypes pandas` in a venv and return its JSON report.
+
+    Parameters
+    ----------
+    venv_path
+        Root directory of a virtual environment with `pyright` installed.
+
+    Returns
+    -------
+    dict
+        The report Pyright printed for `--outputjson`, parsed from JSON.
+
+    Raises
+    ------
+    RuntimeError
+        If Pyright's stdout is not valid JSON (e.g. it failed to start).
+    """
+    env = os.environ.copy()
+    # Put the venv's executables first on PATH so its `pyright` is the one run.
+    # PATH may be unset, in which case only the venv's directory is used.
+    bin_dir = _venv_bin_dir(Path(venv_path))
+    env["PATH"] = os.pathsep.join(p for p in (str(bin_dir), env.get("PATH")) if p)
+    # No `check=True`: the exit status is not used; the verdict comes from the
+    # report, which `main` compares against `THRESHOLD`.
+    proc = subprocess.run(
+        [  # noqa: S607
+            "pyright",
+            "--verifytypes",
+            "pandas",
+            "--ignoreexternal",
+            "--outputjson",
+        ],
+        check=False,
+        env=env,
+        text=True,
+        capture_output=True,
+    )
+    try:
+        return json.loads(proc.stdout)
+    except json.JSONDecodeError as err:
+        msg = (
+            f"pyright did not produce a JSON report (exit code {proc.returncode}); "
+            f"stderr:\n{proc.stderr}"
+        )
+        raise RuntimeError(msg) from err
+
+
+def _is_excluded(name: str) -> bool:
+    """Return whether `name` matches at least one pattern in `EXCLUDE`."""
+    return any(fnmatch(name, pattern) for pattern in EXCLUDE)
+
+
+def parse_pyright_json(data: dict[str, Any]) -> float:
+    """Return the fraction of checked symbols whose type is known to Pyright.
+
+    A symbol is checked if it is exported and at least one of its names (its
+    `name` or any of its `alternateNames`) matches no pattern in `EXCLUDE`.
+
+    Parameters
+    ----------
+    data
+        Parsed `--outputjson` report, as returned by `run_pyright`.
+
+    Returns
+    -------
+    float
+        Number of checked symbols with `isTypeKnown` set, divided by the number
+        of checked symbols.
+
+    Raises
+    ------
+    ValueError
+        If no symbol is left to check, which makes the ratio undefined.
+    """
+    symbols = data["typeCompleteness"]["symbols"]
+    checked = [
+        symbol
+        for symbol in symbols
+        if symbol["isExported"]
+        # Keep symbols where there's any name which doesn't match any excluded patterns.
+        and not all(
+            _is_excluded(name)
+            for name in [symbol["name"], *symbol.get("alternateNames", [])]
+        )
+    ]
+    if not checked:
+        # Previously a bare ZeroDivisionError; an empty set most likely means the
+        # environment is broken, so fail loudly rather than report a ratio.
+        msg = "Pyright reported no exported, non-excluded symbols to check"
+        raise ValueError(msg)
+    return sum(symbol["isTypeKnown"] for symbol in checked) / len(checked)
+
+
+def _install_stubs(site_packages: Path) -> None:
+    """Copy the git-tracked stub files over the `pandas` package in `site_packages`."""
+    dest = site_packages / "pandas"
+    # `-z` yields NUL-separated, unquoted paths; `check=True` so that a failing
+    # `git` cannot silently result in no stubs being copied.
+    tracked_files = subprocess.run(
+        ["git", "ls-files", "-z", "--", _STUBS_DIR],  # noqa: S607
+        check=True,
+        capture_output=True,
+        text=True,
+        cwd=_REPO_ROOT,
+    ).stdout.split("\0")
+    for item in tracked_files:
+        if not item:
+            continue
+        target = dest / Path(item).relative_to(_STUBS_DIR)
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(_REPO_ROOT / item, target)
+
+    # Pyright requires `py.typed` to exist.
+    (dest / "py.typed").write_text("\n")
+
+
+def main() -> int:
+    """Measure type completeness of the stubs and compare it to `THRESHOLD`.
+
+    Creates a throwaway virtual environment, installs `pyright` and `pandas` into
+    it, overlays the stubs on the installed `pandas`, and runs Pyright.
+
+    Returns
+    -------
+    int
+        Process exit code: 0 if completeness is at least `THRESHOLD`, else 1.
+    """
+    # The context manager removes the venv again, even on failure.
+    with tempfile.TemporaryDirectory(prefix="pandas-stubs-venv-") as tmpdir:
+        venv_dir = Path(tmpdir) / "venv"
+        subprocess.run([sys.executable, "-m", "venv", venv_dir], check=True)
+
+        python_name = "python.exe" if sys.platform == "win32" else "python"
+        venv_python = _venv_bin_dir(venv_dir) / python_name
+
+        subprocess.check_call([venv_python, "-m", "pip", "install", "-U", "pip"])
+        subprocess.check_call(
+            [venv_python, "-m", "pip", "install", "-U", "pyright", "pandas"]
+        )
+
+        _install_stubs(venv_site_packages(str(venv_python)))
+
+        sys.stdout.write("Running pyright --verifytypes (may take a while)...\n")
+        report = run_pyright(str(venv_dir))
+
+    completeness = parse_pyright_json(report)
+
+    sys.stdout.write("--- Results ---\n")
+    sys.stdout.write(f"Completeness: {completeness:.4%}\n")
+
+    if completeness < THRESHOLD:
+        sys.stdout.write(
+            f"Completeness {completeness:.4%} below threshold {THRESHOLD:.0%}\n"
+        )
+        return 1
+    sys.stdout.write(f"Completeness is at {THRESHOLD:.0%} threshold\n")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())