From db3c62043ce6f30b0dd0dbb4ae334f15b105bdb3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 5 Mar 2024 17:44:29 +0000 Subject: [PATCH 01/59] GH-116380: Make `glob.glob()` twice as fast --- Lib/glob.py | 433 ++++++++++++++++++++++++++++------------------------ 1 file changed, 237 insertions(+), 196 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 343be78a73b20a..06a89f5cd90c2f 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -1,15 +1,21 @@ """Filename globbing utility.""" -import contextlib import os import re import fnmatch -import itertools -import stat +import functools import sys __all__ = ["glob", "iglob", "escape"] + +_special_parts = ('', '.', '..') +_pattern_flags = re.NOFLAG if os.path.normcase('Aa') == 'Aa' else re.IGNORECASE +_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) +magic_check = re.compile('([*?[])') +magic_check_bytes = re.compile(b'([*?[])') + + def glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False): """Return a list of paths matching a pathname pattern. @@ -42,199 +48,31 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, """ sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) - if root_dir is not None: - root_dir = os.fspath(root_dir) - else: - root_dir = pathname[:0] - it = _iglob(pathname, root_dir, dir_fd, recursive, False, - include_hidden=include_hidden) - if not pathname or recursive and _isrecursive(pathname[:2]): - try: - s = next(it) # skip empty string - if s: - it = itertools.chain((s,), it) - except StopIteration: - pass - return it - -def _iglob(pathname, root_dir, dir_fd, recursive, dironly, - include_hidden=False): - dirname, basename = os.path.split(pathname) - if not has_magic(pathname): - assert not dironly - if basename: - if _lexists(_join(root_dir, pathname), dir_fd): - yield pathname - else: - # Patterns ending with a slash should match only directories - if _isdir(_join(root_dir, dirname), dir_fd): - yield pathname - return - if not dirname: - if recursive and _isrecursive(basename): - yield from _glob2(root_dir, basename, dir_fd, dironly, - include_hidden=include_hidden) - else: - yield from _glob1(root_dir, basename, dir_fd, dironly, - include_hidden=include_hidden) - return - # `os.path.split()` returns the argument itself as a dirname if it is a - # drive or UNC path. Prevent an infinite recursion if a drive or UNC path - # contains magic characters (i.e. r'\\?\C:'). - if dirname != pathname and has_magic(dirname): - dirs = _iglob(dirname, root_dir, dir_fd, recursive, True, - include_hidden=include_hidden) + pathname = os.fspath(pathname) + is_bytes = isinstance(pathname, bytes) + if is_bytes: + pathname = os.fsdecode(pathname) + if root_dir is not None: + root_dir = os.fsdecode(root_dir) + pathname, parts = _split_pathname(pathname) + + select = _selector(parts, recursive, include_hidden) + if pathname: + # Absolute pattern. + drive = os.path.splitdrive(pathname)[0] + paths = select(pathname, pathname, dir_fd, not drive) else: - dirs = [dirname] - if has_magic(basename): - if recursive and _isrecursive(basename): - glob_in_dir = _glob2 + # Relative pattern. + if root_dir is not None: + root_dir = _add_trailing_slash(root_dir) else: - glob_in_dir = _glob1 - else: - glob_in_dir = _glob0 - for dirname in dirs: - for name in glob_in_dir(_join(root_dir, dirname), basename, dir_fd, dironly, - include_hidden=include_hidden): - yield os.path.join(dirname, name) - -# These 2 helper functions non-recursively glob inside a literal directory. -# They return a list of basenames. _glob1 accepts a pattern while _glob0 -# takes a literal basename (so it only has to check for its existence). - -def _glob1(dirname, pattern, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - if include_hidden or not _ishidden(pattern): - names = (x for x in names if include_hidden or not _ishidden(x)) - return fnmatch.filter(names, pattern) - -def _glob0(dirname, basename, dir_fd, dironly, include_hidden=False): - if basename: - if _lexists(_join(dirname, basename), dir_fd): - return [basename] - else: - # `os.path.split()` returns an empty basename for paths ending with a - # directory separator. 'q*x/' should match only directories. - if _isdir(dirname, dir_fd): - return [basename] - return [] - -# Following functions are not public but can be used by third-party code. - -def glob0(dirname, pattern): - return _glob0(dirname, pattern, None, False) - -def glob1(dirname, pattern): - return _glob1(dirname, pattern, None, False) - -# This helper function recursively yields relative pathnames inside a literal -# directory. - -def _glob2(dirname, pattern, dir_fd, dironly, include_hidden=False): - assert _isrecursive(pattern) - if not dirname or _isdir(dirname, dir_fd): - yield pattern[:0] - yield from _rlistdir(dirname, dir_fd, dironly, - include_hidden=include_hidden) - -# If dironly is false, yields all file names inside a directory. -# If dironly is true, yields only directory names. -def _iterdir(dirname, dir_fd, dironly): - try: - fd = None - fsencode = None - if dir_fd is not None: - if dirname: - fd = arg = os.open(dirname, _dir_open_flags, dir_fd=dir_fd) - else: - arg = dir_fd - if isinstance(dirname, bytes): - fsencode = os.fsencode - elif dirname: - arg = dirname - elif isinstance(dirname, bytes): - arg = bytes(os.curdir, 'ASCII') - else: - arg = os.curdir - try: - with os.scandir(arg) as it: - for entry in it: - try: - if not dironly or entry.is_dir(): - if fsencode is not None: - yield fsencode(entry.name) - else: - yield entry.name - except OSError: - pass - finally: - if fd is not None: - os.close(fd) - except OSError: - return - -def _listdir(dirname, dir_fd, dironly): - with contextlib.closing(_iterdir(dirname, dir_fd, dironly)) as it: - return list(it) - -# Recursively yields relative pathnames inside a literal directory. -def _rlistdir(dirname, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - for x in names: - if include_hidden or not _ishidden(x): - yield x - path = _join(dirname, x) if dirname else x - for y in _rlistdir(path, dir_fd, dironly, - include_hidden=include_hidden): - yield _join(x, y) - - -def _lexists(pathname, dir_fd): - # Same as os.path.lexists(), but with dir_fd - if dir_fd is None: - return os.path.lexists(pathname) - try: - os.lstat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False - else: - return True + root_dir = './' + paths = select(root_dir, root_dir, dir_fd, False) + paths = _remove_prefix(paths, root_dir) + if is_bytes: + paths = (os.fsencode(path) for path in paths) + return paths -def _isdir(pathname, dir_fd): - # Same as os.path.isdir(), but with dir_fd - if dir_fd is None: - return os.path.isdir(pathname) - try: - st = os.stat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False - else: - return stat.S_ISDIR(st.st_mode) - -def _join(dirname, basename): - # It is common if dirname or basename is empty - if not dirname or not basename: - return dirname or basename - return os.path.join(dirname, basename) - -magic_check = re.compile('([*?[])') -magic_check_bytes = re.compile(b'([*?[])') - -def has_magic(s): - if isinstance(s, bytes): - match = magic_check_bytes.search(s) - else: - match = magic_check.search(s) - return match is not None - -def _ishidden(path): - return path[0] in ('.', b'.'[0]) - -def _isrecursive(pattern): - if isinstance(pattern, bytes): - return pattern == b'**' - else: - return pattern == '**' def escape(pathname): """Escape all special characters. @@ -249,9 +87,6 @@ def escape(pathname): return drive + pathname -_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) - - def translate(pat, *, recursive=False, include_hidden=False, seps=None): """Translate a pathname with shell wildcards to a regular expression. @@ -310,3 +145,209 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): results.append(any_sep) res = ''.join(results) return fr'(?s:{res})\Z' + + +def _split_pathname(pathname): + """Split the given path into a pair (anchor, parts), where *anchor* is the + path drive and root (if any), and *parts* is a tuple of path components. + """ + parts = [] + split = os.path.split + dirname, part = split(pathname) + while dirname != pathname: + parts.append(part) + pathname = dirname + dirname, part = split(pathname) + parts.reverse() + return dirname, tuple(parts) + + +def _add_trailing_slash(pathname): + """Returns the given path with a trailing slash added, where possible. + """ + return os.path.join(pathname, '') + + +def _remove_prefix(paths, prefix): + """Yields paths with a given prefix removed, filtering out empty results. + """ + for path in paths: + path = path.removeprefix(prefix) + if path: + yield path + + +def _open_dir(path, rel_path, dir_fd): + """Scans the given directory, and returns a 4-tuple with these parts: + + 1. A path or fd to supply to `os.scandir()`. + 2. A prefix to apply to `os.DirEntry.path`, or None. + 3. The file descriptor for the directory, or None. + 4. Whether the caller should close the fd (bool). + """ + if dir_fd is None: + return path, None, None, False + elif rel_path == './': + return dir_fd, _add_trailing_slash(path), dir_fd, False + else: + fd = os.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + return fd, _add_trailing_slash(path), fd, True + + +@functools.lru_cache(maxsize=1024) +def _selector(parts, recursive, include_hidden): + """Returns a function that selects from a given path, walking and + filtering according to the glob-style pattern parts in *parts*. + """ + if not parts: + return _select_exists + part = parts[0] + if recursive and part == '**': + selector = _recursive_selector + elif magic_check.search(part) is not None: + selector = _wildcard_selector + else: + selector = _literal_selector + return selector(part, parts[1:], recursive, include_hidden) + + +def _literal_selector(part, parts, recursive, include_hidden): + """Returns a function that selects a literal descendant of a given path. + """ + is_special = part in _special_parts + while parts: + next_part = parts[0] + if magic_check.search(next_part) is not None: + break + # Consume next non-wildcard component (speeds up joining). + if next_part not in _special_parts: + is_special = False + part += os.path.sep + next_part + parts = parts[1:] + + select_next = _selector(parts, recursive, include_hidden) + + def select_literal(path, rel_path, dir_fd, exists): + path = _add_trailing_slash(path) + part + rel_path = _add_trailing_slash(rel_path) + part + yield from select_next(path, rel_path, dir_fd, exists and is_special) + return select_literal + + +def _wildcard_selector(part, parts, recursive, include_hidden): + """Returns a function that selects direct children of a given path, + filtering by pattern. + """ + if include_hidden and part == '*': + match = None # Skip generating a pattern that would match all inputs. + else: + regex = translate(part, recursive=recursive, + include_hidden=include_hidden, seps=os.path.sep) + match = re.compile(regex, flags=_pattern_flags).match + + dir_only = bool(parts) + select_next = _selector(parts, recursive, include_hidden) + + def select_wildcard(path, rel_path, dir_fd, exists): + close_fd = False + try: + arg, entry_prefix, fd, close_fd = _open_dir(path, rel_path, dir_fd) + with os.scandir(arg) as scandir_it: + entries = list(scandir_it) + for entry in entries: + if match is None or match(entry.name): + if dir_only: + try: + if not entry.is_dir(): + continue + except OSError: + continue + entry_path = entry.path + if entry_prefix is not None: + entry_path = entry_prefix + entry_path + yield from select_next(entry_path, entry.name, fd, True) + except OSError: + pass + finally: + if close_fd: + os.close(fd) + return select_wildcard + + +def _recursive_selector(part, parts, recursive, include_hidden): + """Returns a function that selects a given path and all its children, + recursively, filtering by pattern. + """ + while parts: + next_part = parts[0] + if next_part in _special_parts: + break + # Consume next non-special component (used to build regex). + part += os.path.sep + next_part + parts = parts[1:] + + if include_hidden and part == '**': + match = None # Skip generating a pattern that would match all inputs. + else: + regex = translate(part, recursive=recursive, + include_hidden=include_hidden, seps=os.path.sep) + match = re.compile(regex, flags=_pattern_flags).match + + dir_only = bool(parts) + select_next = _selector(parts, recursive, include_hidden) + + def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): + if match_pos is None: + path = _add_trailing_slash(path) + rel_path = _add_trailing_slash(rel_path) + match_pos = len(path) + + if match is None or match(path, match_pos): + yield from select_next(path, rel_path, dir_fd, exists) + + close_fd = False + try: + arg, entry_prefix, fd, close_fd = _open_dir(path, rel_path, dir_fd) + with os.scandir(arg) as scandir_it: + entries = list(scandir_it) + for entry in entries: + is_dir = False + try: + if entry.is_dir(): + is_dir = True + except OSError: + pass + + if is_dir or not dir_only: + entry_path = entry.path + if entry_prefix is not None: + entry_path = entry_prefix + entry_path + if is_dir: + yield from select_recursive(entry_path, entry.name, fd, True, match_pos) + elif match is None or match(entry_path, match_pos): + yield from select_next(entry_path, entry.name, fd, True) + except OSError: + pass + finally: + if close_fd: + os.close(fd) + return select_recursive + + +def _select_exists(path, rel_path, dir_fd, exists): + """Yields the given path, if it exists. + """ + if exists: + yield path + elif dir_fd is None: + try: + os.lstat(path) + yield path + except OSError: + pass + else: + try: + os.lstat(rel_path, dir_fd=dir_fd) + yield path + except OSError: + pass From 9e1f059760355fb72cebde0cb066d571886374cc Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 5 Mar 2024 23:30:27 +0000 Subject: [PATCH 02/59] Use `os.listdir()` if we don't need to check entry type. --- Lib/glob.py | 54 +++++++++++-------- ...-03-05-23-08-11.gh-issue-116380.56HU7I.rst | 2 + 2 files changed, 33 insertions(+), 23 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst diff --git a/Lib/glob.py b/Lib/glob.py index 06a89f5cd90c2f..ebdc4db1c9dd2d 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -178,20 +178,19 @@ def _remove_prefix(paths, prefix): def _open_dir(path, rel_path, dir_fd): - """Scans the given directory, and returns a 4-tuple with these parts: + """Scans the given directory, and returns a 3-tuple with these parts: 1. A path or fd to supply to `os.scandir()`. - 2. A prefix to apply to `os.DirEntry.path`, or None. - 3. The file descriptor for the directory, or None. - 4. Whether the caller should close the fd (bool). + 2. The file descriptor for the directory, or None. + 3. Whether the caller should close the fd (bool). """ if dir_fd is None: - return path, None, None, False + return path, None, False elif rel_path == './': - return dir_fd, _add_trailing_slash(path), dir_fd, False + return dir_fd, dir_fd, False else: fd = os.open(rel_path, _dir_open_flags, dir_fd=dir_fd) - return fd, _add_trailing_slash(path), fd, True + return fd, fd, True @functools.lru_cache(maxsize=1024) @@ -251,21 +250,28 @@ def _wildcard_selector(part, parts, recursive, include_hidden): def select_wildcard(path, rel_path, dir_fd, exists): close_fd = False try: - arg, entry_prefix, fd, close_fd = _open_dir(path, rel_path, dir_fd) - with os.scandir(arg) as scandir_it: - entries = list(scandir_it) - for entry in entries: - if match is None or match(entry.name): - if dir_only: + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) + if dir_only: + if fd is not None: + prefix = _add_trailing_slash(path) + with os.scandir(arg) as scandir_it: + entries = list(scandir_it) + for entry in entries: + if match is None or match(entry.name): try: - if not entry.is_dir(): - continue + if entry.is_dir(): + entry_path = entry.path + if fd is not None: + entry_path = prefix + entry_path + yield from select_next(entry_path, entry.name, fd, True) except OSError: - continue - entry_path = entry.path - if entry_prefix is not None: - entry_path = entry_prefix + entry_path - yield from select_next(entry_path, entry.name, fd, True) + pass + else: + prefix = _add_trailing_slash(path) + for name in os.listdir(arg): + if match is None or match(name): + yield from select_next(prefix + name, name, fd, True) + except OSError: pass finally: @@ -307,7 +313,9 @@ def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): close_fd = False try: - arg, entry_prefix, fd, close_fd = _open_dir(path, rel_path, dir_fd) + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) + if fd is not None: + prefix = _add_trailing_slash(path) with os.scandir(arg) as scandir_it: entries = list(scandir_it) for entry in entries: @@ -320,8 +328,8 @@ def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): if is_dir or not dir_only: entry_path = entry.path - if entry_prefix is not None: - entry_path = entry_prefix + entry_path + if fd is not None: + entry_path = prefix + entry_path if is_dir: yield from select_recursive(entry_path, entry.name, fd, True, match_pos) elif match is None or match(entry_path, match_pos): diff --git a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst new file mode 100644 index 00000000000000..6c96ec6d3422a1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst @@ -0,0 +1,2 @@ +Speed up :func:`glob.glob` by making use of :func:`glob.translate` and +tracking path existence in more detail. From 10432df36bc6d5f4c509b63f670349f7a6a0b0e3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 00:13:59 +0000 Subject: [PATCH 03/59] A few small speedups. --- Lib/glob.py | 62 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index ebdc4db1c9dd2d..a37683e9c2ff0d 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -68,9 +68,9 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, else: root_dir = './' paths = select(root_dir, root_dir, dir_fd, False) - paths = _remove_prefix(paths, root_dir) + paths = _remove_prefix(paths, len(root_dir)) if is_bytes: - paths = (os.fsencode(path) for path in paths) + paths = map(os.fsencode, paths) return paths @@ -147,6 +147,12 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\Z' +def _compile_pattern(pattern, recursive, include_hidden): + regex = translate(pattern, recursive=recursive, + include_hidden=include_hidden, seps=os.path.sep) + return re.compile(regex, flags=_pattern_flags).match + + def _split_pathname(pathname): """Split the given path into a pair (anchor, parts), where *anchor* is the path drive and root (if any), and *parts* is a tuple of path components. @@ -168,11 +174,11 @@ def _add_trailing_slash(pathname): return os.path.join(pathname, '') -def _remove_prefix(paths, prefix): +def _remove_prefix(paths, prefix_len): """Yields paths with a given prefix removed, filtering out empty results. """ for path in paths: - path = path.removeprefix(prefix) + path = path[prefix_len:] if path: yield path @@ -240,18 +246,14 @@ def _wildcard_selector(part, parts, recursive, include_hidden): if include_hidden and part == '*': match = None # Skip generating a pattern that would match all inputs. else: - regex = translate(part, recursive=recursive, - include_hidden=include_hidden, seps=os.path.sep) - match = re.compile(regex, flags=_pattern_flags).match - - dir_only = bool(parts) - select_next = _selector(parts, recursive, include_hidden) - - def select_wildcard(path, rel_path, dir_fd, exists): - close_fd = False - try: - arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) - if dir_only: + match = _compile_pattern(part, recursive, include_hidden) + + if parts: + select_next = _selector(parts, recursive, include_hidden) + def select_wildcard(path, rel_path, dir_fd, exists): + close_fd = False + try: + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: prefix = _add_trailing_slash(path) with os.scandir(arg) as scandir_it: @@ -266,17 +268,27 @@ def select_wildcard(path, rel_path, dir_fd, exists): yield from select_next(entry_path, entry.name, fd, True) except OSError: pass - else: + except OSError: + pass + finally: + if close_fd: + os.close(fd) + + else: + def select_wildcard(path, rel_path, dir_fd, exists): + close_fd = False + try: + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) prefix = _add_trailing_slash(path) for name in os.listdir(arg): if match is None or match(name): - yield from select_next(prefix + name, name, fd, True) + yield prefix + name + except OSError: + pass + finally: + if close_fd: + os.close(fd) - except OSError: - pass - finally: - if close_fd: - os.close(fd) return select_wildcard @@ -295,9 +307,7 @@ def _recursive_selector(part, parts, recursive, include_hidden): if include_hidden and part == '**': match = None # Skip generating a pattern that would match all inputs. else: - regex = translate(part, recursive=recursive, - include_hidden=include_hidden, seps=os.path.sep) - match = re.compile(regex, flags=_pattern_flags).match + match = _compile_pattern(part, recursive, include_hidden) dir_only = bool(parts) select_next = _selector(parts, recursive, include_hidden) From 7e389e21fe3cfaa5297284346387b4d12b520d46 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 02:10:22 +0000 Subject: [PATCH 04/59] Simplify prefix removal --- Lib/glob.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index a37683e9c2ff0d..ce62226215f643 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -1,5 +1,6 @@ """Filename globbing utility.""" +import operator import os import re import fnmatch @@ -49,6 +50,8 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) pathname = os.fspath(pathname) + if not pathname: + return iter(()) is_bytes = isinstance(pathname, bytes) if is_bytes: pathname = os.fsdecode(pathname) @@ -68,7 +71,10 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, else: root_dir = './' paths = select(root_dir, root_dir, dir_fd, False) - paths = _remove_prefix(paths, len(root_dir)) + if recursive and (parts == ('**',) or parts == ('**', '')): + next(paths) # Do not emit root_dir + root_slicer = operator.itemgetter(slice(len(root_dir), None)) + paths = map(root_slicer, paths) if is_bytes: paths = map(os.fsencode, paths) return paths @@ -174,15 +180,6 @@ def _add_trailing_slash(pathname): return os.path.join(pathname, '') -def _remove_prefix(paths, prefix_len): - """Yields paths with a given prefix removed, filtering out empty results. - """ - for path in paths: - path = path[prefix_len:] - if path: - yield path - - def _open_dir(path, rel_path, dir_fd): """Scans the given directory, and returns a 3-tuple with these parts: From 8680a0a8e96f3244c9d6fb0063e18df81df2000d Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 03:15:28 +0000 Subject: [PATCH 05/59] Re-implement `glob0()`, `glob1()`, and `has_magic()`. --- Lib/glob.py | 28 ++++++++++++++++++++++++++-- Lib/test/test_glob.py | 19 +++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index ce62226215f643..2d840a683b21ca 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -13,8 +13,6 @@ _special_parts = ('', '.', '..') _pattern_flags = re.NOFLAG if os.path.normcase('Aa') == 'Aa' else re.IGNORECASE _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) -magic_check = re.compile('([*?[])') -magic_check_bytes = re.compile(b'([*?[])') def glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, @@ -80,6 +78,17 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, return paths +magic_check = re.compile('([*?[])') +magic_check_bytes = re.compile(b'([*?[])') + +def has_magic(s): + if isinstance(s, bytes): + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + def escape(pathname): """Escape all special characters. """ @@ -366,3 +375,18 @@ def _select_exists(path, rel_path, dir_fd, exists): yield path except OSError: pass + + +def _legacy_glob(selector, dirname, pattern): + """Implements the undocumented glob0() and glob1() functions. + """ + root = _add_trailing_slash(dirname) + root_slicer = operator.itemgetter(slice(len(root), None)) + select = selector(pattern, (), False, False) + paths = select(dirname, dirname, None, False) + paths = map(root_slicer, paths) + return list(paths) + + +glob0 = functools.partial(_legacy_glob, _literal_selector) +glob1 = functools.partial(_legacy_glob, _wildcard_selector) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 8b2ea8f89f5daf..4b282f402b688e 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -361,6 +361,25 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_glob0(self): + # This undocumented function matches literal paths + eq = self.assertSequencesEqual_noorder + eq(glob.glob0(self.tempdir, ''), ['']) + eq(glob.glob0(self.tempdir, 'a'), ['a']) + eq(glob.glob0(self.tempdir, '.bb'), ['.bb']) + eq(glob.glob0(self.tempdir, 'b'), []) + + def test_glob1(self): + # This undocumented function matches non-recursive wildcards + eq = self.assertSequencesEqual_noorder + eq(glob.glob1(self.tempdir, 'a'), ['a']) + eq(glob.glob1(self.tempdir, '.bb'), ['.bb']) + eq(glob.glob1(self.tempdir, '.b*'), ['.bb']) + eq(glob.glob1(self.tempdir, 'b'), []) + eq(glob.glob1(self.tempdir, '?'), ['a']) + eq(glob.glob1(self.tempdir, '*a'), ['a', 'aaa']) + eq(glob.glob1(self.tempdir, 'a*'), ['a', 'aaa', 'aab']) + def test_translate_matching(self): match = re.compile(glob.translate('*')).match self.assertIsNotNone(match('foo')) From 3bf3124f8145141c85ac69be2e374e16663d177d Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 03:53:17 +0000 Subject: [PATCH 06/59] Fix errant `StopIteration`. --- Lib/glob.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 2d840a683b21ca..a205450bb52328 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -64,13 +64,16 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, paths = select(pathname, pathname, dir_fd, not drive) else: # Relative pattern. - if root_dir is not None: - root_dir = _add_trailing_slash(root_dir) - else: + if root_dir is None: root_dir = './' + else: + root_dir = _add_trailing_slash(root_dir) paths = select(root_dir, root_dir, dir_fd, False) if recursive and (parts == ('**',) or parts == ('**', '')): - next(paths) # Do not emit root_dir + # Consume root_dir. + for path in paths: + assert path == root_dir + break root_slicer = operator.itemgetter(slice(len(root_dir), None)) paths = map(root_slicer, paths) if is_bytes: From f8fb9923972450bc6d379d81bc0f55ae7f155ef2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 04:11:56 +0000 Subject: [PATCH 07/59] Skip compiling pattern for consecutive `**` segments. --- Lib/glob.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index a205450bb52328..3f08e223b05340 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -310,7 +310,8 @@ def _recursive_selector(part, parts, recursive, include_hidden): if next_part in _special_parts: break # Consume next non-special component (used to build regex). - part += os.path.sep + next_part + if next_part != part: + part += os.path.sep + next_part parts = parts[1:] if include_hidden and part == '**': From 50ef080501f5eee1480822a6a1a7383398770414 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 04:19:15 +0000 Subject: [PATCH 08/59] Clarify regex/path building in literal and recursive selectors. --- Lib/glob.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 3f08e223b05340..174034ac823093 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -229,14 +229,11 @@ def _literal_selector(part, parts, recursive, include_hidden): """Returns a function that selects a literal descendant of a given path. """ is_special = part in _special_parts - while parts: - next_part = parts[0] - if magic_check.search(next_part) is not None: - break + while parts and magic_check.search(parts[0]) is None: # Consume next non-wildcard component (speeds up joining). - if next_part not in _special_parts: + if parts[0] not in _special_parts: is_special = False - part += os.path.sep + next_part + part += os.path.sep + parts[0] parts = parts[1:] select_next = _selector(parts, recursive, include_hidden) @@ -305,13 +302,11 @@ def _recursive_selector(part, parts, recursive, include_hidden): """Returns a function that selects a given path and all its children, recursively, filtering by pattern. """ - while parts: - next_part = parts[0] - if next_part in _special_parts: - break + while parts and parts[0] == '**': + parts = parts[1:] + while parts and parts[0] not in _special_parts: # Consume next non-special component (used to build regex). - if next_part != part: - part += os.path.sep + next_part + part += os.path.sep + parts[0] parts = parts[1:] if include_hidden and part == '**': From ccefacd61d6c092ae3765df0d236423b72c3eda1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 05:49:27 +0000 Subject: [PATCH 09/59] Simplify code to ignore root_dir. --- Lib/glob.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 174034ac823093..23385a15c40a5f 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -5,6 +5,7 @@ import re import fnmatch import functools +import itertools import sys __all__ = ["glob", "iglob", "escape"] @@ -48,8 +49,6 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) pathname = os.fspath(pathname) - if not pathname: - return iter(()) is_bytes = isinstance(pathname, bytes) if is_bytes: pathname = os.fsdecode(pathname) @@ -68,14 +67,10 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, root_dir = './' else: root_dir = _add_trailing_slash(root_dir) - paths = select(root_dir, root_dir, dir_fd, False) - if recursive and (parts == ('**',) or parts == ('**', '')): - # Consume root_dir. - for path in paths: - assert path == root_dir - break root_slicer = operator.itemgetter(slice(len(root_dir), None)) + paths = select(root_dir, root_dir, dir_fd, False) paths = map(root_slicer, paths) + paths = itertools.dropwhile(lambda path: not path, paths) if is_bytes: paths = map(os.fsencode, paths) return paths From fa951f6f3301c0b9a8c23467df1e51ff74849cbf Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 06:19:43 +0000 Subject: [PATCH 10/59] Fix possible Windows separator issue. --- Lib/glob.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 23385a15c40a5f..25089a63628f14 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -1,17 +1,17 @@ """Filename globbing utility.""" -import operator import os import re import fnmatch import functools import itertools +import operator import sys __all__ = ["glob", "iglob", "escape"] -_special_parts = ('', '.', '..') +_special_parts = ('', os.path.curdir, os.path.pardir) _pattern_flags = re.NOFLAG if os.path.normcase('Aa') == 'Aa' else re.IGNORECASE _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) @@ -64,9 +64,8 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, else: # Relative pattern. if root_dir is None: - root_dir = './' - else: - root_dir = _add_trailing_slash(root_dir) + root_dir = os.path.curdir + root_dir = _add_trailing_slash(root_dir) root_slicer = operator.itemgetter(slice(len(root_dir), None)) paths = select(root_dir, root_dir, dir_fd, False) paths = map(root_slicer, paths) From 0aec12cf00b3e1786d34f800c972f1a3c7565ed2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 21:20:34 +0000 Subject: [PATCH 11/59] Address some review feedback. --- Lib/glob.py | 8 +++++++- .../2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 25089a63628f14..447f748cb224bc 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -187,7 +187,7 @@ def _add_trailing_slash(pathname): def _open_dir(path, rel_path, dir_fd): - """Scans the given directory, and returns a 3-tuple with these parts: + """Prepares the directory for scanning. Returns a 3-tuple with parts: 1. A path or fd to supply to `os.scandir()`. 2. The file descriptor for the directory, or None. @@ -256,6 +256,8 @@ def select_wildcard(path, rel_path, dir_fd, exists): arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: prefix = _add_trailing_slash(path) + # Ensure we don't exhaust file descriptors when globbing deep + # trees by closing the directory *before* yielding anything. with os.scandir(arg) as scandir_it: entries = list(scandir_it) for entry in entries: @@ -280,6 +282,8 @@ def select_wildcard(path, rel_path, dir_fd, exists): try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) prefix = _add_trailing_slash(path) + # We use listdir() rather than scandir() because we don't need + # to check for subdirectories; we only need the child names. for name in os.listdir(arg): if match is None or match(name): yield prefix + name @@ -325,6 +329,8 @@ def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: prefix = _add_trailing_slash(path) + # Ensure we don't exhaust file descriptors when globbing deep + # trees by closing the directory *before* yielding anything. with os.scandir(arg) as scandir_it: entries = list(scandir_it) for entry in entries: diff --git a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst index 6c96ec6d3422a1..db235d153c8666 100644 --- a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst +++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst @@ -1,2 +1,2 @@ -Speed up :func:`glob.glob` by making use of :func:`glob.translate` and -tracking path existence in more detail. +Speed up :func:`glob.glob` and :func:`glob.iglob` by making use of +:func:`glob.translate` and tracking path existence in more detail. From 72691baa60ae945e98c0c419817b00a1a221d964 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 22:06:27 +0000 Subject: [PATCH 12/59] Use assignment expressions in a couple of places --- Lib/glob.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 447f748cb224bc..15a0740f953914 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -223,11 +223,11 @@ def _literal_selector(part, parts, recursive, include_hidden): """Returns a function that selects a literal descendant of a given path. """ is_special = part in _special_parts - while parts and magic_check.search(parts[0]) is None: + while parts and magic_check.search(next_part := parts[0]) is None: # Consume next non-wildcard component (speeds up joining). - if parts[0] not in _special_parts: + if next_part not in _special_parts: is_special = False - part += os.path.sep + parts[0] + part += os.path.sep + next_part parts = parts[1:] select_next = _selector(parts, recursive, include_hidden) @@ -302,9 +302,9 @@ def _recursive_selector(part, parts, recursive, include_hidden): """ while parts and parts[0] == '**': parts = parts[1:] - while parts and parts[0] not in _special_parts: + while parts and (next_part := parts[0]) not in _special_parts: # Consume next non-special component (used to build regex). - part += os.path.sep + parts[0] + part += os.path.sep + next_part parts = parts[1:] if include_hidden and part == '**': From c58dd21c0c06984bcde55061081192dd91bb4032 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 22:12:21 +0000 Subject: [PATCH 13/59] Replace lambda with `operator.not_`. --- Lib/glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index 15a0740f953914..d19560ecf75144 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -69,7 +69,7 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, root_slicer = operator.itemgetter(slice(len(root_dir), None)) paths = select(root_dir, root_dir, dir_fd, False) paths = map(root_slicer, paths) - paths = itertools.dropwhile(lambda path: not path, paths) + paths = itertools.dropwhile(operator.not_, paths) if is_bytes: paths = map(os.fsencode, paths) return paths From 22b30db6dad43a3d81459b4a16b6febf516af146 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 6 Mar 2024 23:35:03 +0000 Subject: [PATCH 14/59] Speed up `_add_trailing_slash()` --- Lib/glob.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index d19560ecf75144..e17a927846eca5 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -14,6 +14,10 @@ _special_parts = ('', os.path.curdir, os.path.pardir) _pattern_flags = re.NOFLAG if os.path.normcase('Aa') == 'Aa' else re.IGNORECASE _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) +if os.path.altsep: + _path_seps = (os.path.sep, os.path.altsep) +else: + _path_seps = os.path.sep def glob(pathname, *, root_dir=None, dir_fd=None, recursive=False, @@ -114,10 +118,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): given, os.path.sep and os.path.altsep (where available) are used. """ if not seps: - if os.path.altsep: - seps = (os.path.sep, os.path.altsep) - else: - seps = os.path.sep + seps = _path_seps escaped_seps = ''.join(map(re.escape, seps)) any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' @@ -183,7 +184,10 @@ def _split_pathname(pathname): def _add_trailing_slash(pathname): """Returns the given path with a trailing slash added, where possible. """ - return os.path.join(pathname, '') + tail = os.path.splitdrive(pathname)[1] + if not tail or tail[-1] in _path_seps: + return pathname + return pathname + os.path.sep def _open_dir(path, rel_path, dir_fd): From 83b70bd5885a1a493ff50a443ecbb44e80962314 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 7 Mar 2024 02:09:17 +0000 Subject: [PATCH 15/59] Speed up `select_literal()` --- Lib/glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index e17a927846eca5..e6eae3b24e81cd 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -239,7 +239,7 @@ def _literal_selector(part, parts, recursive, include_hidden): def select_literal(path, rel_path, dir_fd, exists): path = _add_trailing_slash(path) + part rel_path = _add_trailing_slash(rel_path) + part - yield from select_next(path, rel_path, dir_fd, exists and is_special) + return select_next(path, rel_path, dir_fd, exists and is_special) return select_literal From 1d32d14517e39b28d0fecd6bab801733cf1566ee Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 7 Mar 2024 03:11:04 +0000 Subject: [PATCH 16/59] Speed up `select_recursive()` --- Lib/glob.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index e6eae3b24e81cd..ef58e927fc3b27 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -319,15 +319,15 @@ def _recursive_selector(part, parts, recursive, include_hidden): dir_only = bool(parts) select_next = _selector(parts, recursive, include_hidden) - def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): - if match_pos is None: - path = _add_trailing_slash(path) - rel_path = _add_trailing_slash(rel_path) - match_pos = len(path) - + def select_recursive(path, rel_path, dir_fd, exists): + path = _add_trailing_slash(path) + rel_path = _add_trailing_slash(rel_path) + match_pos = len(path) if match is None or match(path, match_pos): yield from select_next(path, rel_path, dir_fd, exists) + yield from select_recursive_step(path, rel_path, dir_fd, match_pos) + def select_recursive_step(path, rel_path, dir_fd, match_pos): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -349,10 +349,13 @@ def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): entry_path = entry.path if fd is not None: entry_path = prefix + entry_path + if match is None or match(entry_path, match_pos): + if dir_only: + yield from select_next(entry_path, entry.name, fd, True) + else: + yield entry_path if is_dir: - yield from select_recursive(entry_path, entry.name, fd, True, match_pos) - elif match is None or match(entry_path, match_pos): - yield from select_next(entry_path, entry.name, fd, True) + yield from select_recursive_step(entry_path, entry.name, fd, match_pos) except OSError: pass finally: From f1440a9a5a5b744b2c77c8da8201e780a4a2b170 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 18 Mar 2024 22:35:27 +0000 Subject: [PATCH 17/59] Cache compiled patterns rather than selectors. --- Lib/glob.py | 62 ++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 1d31f153f552e9..3baf8a85cb7608 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -155,7 +155,15 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\Z' +@functools.lru_cache(maxsize=32768) def _compile_pattern(pattern, recursive, include_hidden): + if include_hidden: + if recursive: + if pattern == '**': + return None + else: + if pattern == '*': + return None regex = translate(pattern, recursive=recursive, include_hidden=include_hidden, seps=os.path.sep) return re.compile(regex, flags=_pattern_flags).match @@ -163,7 +171,7 @@ def _compile_pattern(pattern, recursive, include_hidden): def _split_pathname(pathname): """Split the given path into a pair (anchor, parts), where *anchor* is the - path drive and root (if any), and *parts* is a tuple of path components. + path drive and root (if any), and *parts* is a reversed list of path parts. """ parts = [] split = os.path.split @@ -172,8 +180,7 @@ def _split_pathname(pathname): parts.append(part) pathname = dirname dirname, part = split(pathname) - parts.reverse() - return dirname, tuple(parts) + return dirname, parts def _add_trailing_slash(pathname): @@ -201,33 +208,32 @@ def _open_dir(path, rel_path, dir_fd): return fd, fd, True -@functools.lru_cache(maxsize=1024) def _selector(parts, recursive, include_hidden): """Returns a function that selects from a given path, walking and filtering according to the glob-style pattern parts in *parts*. """ if not parts: return _select_exists - part = parts[0] - if recursive and part == '**': + elif recursive and parts[-1] == '**': selector = _recursive_selector - elif magic_check.search(part) is not None: + elif magic_check.search(parts[-1]) is not None: selector = _wildcard_selector else: selector = _literal_selector - return selector(part, parts[1:], recursive, include_hidden) + return selector(parts, recursive, include_hidden) -def _literal_selector(part, parts, recursive, include_hidden): +def _literal_selector(parts, recursive, include_hidden): """Returns a function that selects a literal descendant of a given path. """ + part = parts.pop() is_special = part in _special_parts - while parts and magic_check.search(next_part := parts[0]) is None: + while parts and magic_check.search(parts[-1]) is None: # Consume next non-wildcard component (speeds up joining). + next_part = parts.pop() if next_part not in _special_parts: is_special = False part += os.path.sep + next_part - parts = parts[1:] select_next = _selector(parts, recursive, include_hidden) @@ -238,14 +244,11 @@ def select_literal(path, rel_path, dir_fd, exists): return select_literal -def _wildcard_selector(part, parts, recursive, include_hidden): +def _wildcard_selector(parts, recursive, include_hidden): """Returns a function that selects direct children of a given path, filtering by pattern. """ - if include_hidden and part == '*': - match = None # Skip generating a pattern that would match all inputs. - else: - match = _compile_pattern(part, recursive, include_hidden) + match = _compile_pattern(parts.pop(), False, include_hidden) if parts: select_next = _selector(parts, recursive, include_hidden) @@ -295,22 +298,18 @@ def select_wildcard(path, rel_path, dir_fd, exists): return select_wildcard -def _recursive_selector(part, parts, recursive, include_hidden): +def _recursive_selector(parts, recursive, include_hidden): """Returns a function that selects a given path and all its children, recursively, filtering by pattern. """ - while parts and parts[0] == '**': - parts = parts[1:] - while parts and (next_part := parts[0]) not in _special_parts: + part = parts.pop() + while parts and parts[-1] == '**': + parts.pop() + while parts and parts[-1] not in _special_parts: # Consume next non-special component (used to build regex). - part += os.path.sep + next_part - parts = parts[1:] - - if include_hidden and part == '**': - match = None # Skip generating a pattern that would match all inputs. - else: - match = _compile_pattern(part, recursive, include_hidden) + part += os.path.sep + parts.pop() + match = _compile_pattern(part, True, include_hidden) dir_only = bool(parts) select_next = _selector(parts, recursive, include_hidden) @@ -381,10 +380,11 @@ def _select_exists(path, rel_path, dir_fd, exists): def _legacy_glob(selector, dirname, pattern): """Implements the undocumented glob0() and glob1() functions. """ - root = _add_trailing_slash(dirname) - root_slicer = operator.itemgetter(slice(len(root), None)) - select = selector(pattern, (), False, False) - paths = select(dirname, dirname, None, False) + parts = [pattern] + select = selector(parts, recursive=False, include_hidden=False) + root_dir = _add_trailing_slash(dirname) + root_slicer = operator.itemgetter(slice(len(root_dir), None)) + paths = select(dirname, dirname, dir_fd=None, exists=False) paths = map(root_slicer, paths) return list(paths) From 9c64643d139daf1e572a86f58a2da05a98705cc1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 18 Mar 2024 23:40:06 +0000 Subject: [PATCH 18/59] Remove a bit of code duplication. --- Lib/glob.py | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 3baf8a85cb7608..d5ca7d670739ee 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -69,15 +69,23 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, # Relative pattern. if root_dir is None: root_dir = os.path.curdir - root_dir = _add_trailing_slash(root_dir) - root_slicer = operator.itemgetter(slice(len(root_dir), None)) - paths = select(root_dir, root_dir, dir_fd, False) - paths = map(root_slicer, paths) + paths = _relative_glob(select, root_dir, dir_fd) paths = itertools.dropwhile(operator.not_, paths) if is_bytes: paths = map(os.fsencode, paths) return paths +# Following functions are not public but can be used by third-party code. + +def glob0(dirname, pattern): + select = _literal_selector([pattern], False, False) + paths = _relative_glob(select, dirname, None) + return list(paths) + +def glob1(dirname, pattern): + select = _wildcard_selector([pattern], False, False) + paths = _relative_glob(select, dirname, None) + return list(paths) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -208,6 +216,17 @@ def _open_dir(path, rel_path, dir_fd): return fd, fd, True +def _relative_glob(select, dirname, dir_fd): + """Globs using a select function from the given dirname. The dirname + prefix is removed from results. + """ + dirname = _add_trailing_slash(dirname) + slicer = operator.itemgetter(slice(len(dirname), None)) + paths = select(dirname, dirname, dir_fd, False) + paths = map(slicer, paths) + return paths + + def _selector(parts, recursive, include_hidden): """Returns a function that selects from a given path, walking and filtering according to the glob-style pattern parts in *parts*. @@ -376,18 +395,3 @@ def _select_exists(path, rel_path, dir_fd, exists): except OSError: pass - -def _legacy_glob(selector, dirname, pattern): - """Implements the undocumented glob0() and glob1() functions. - """ - parts = [pattern] - select = selector(parts, recursive=False, include_hidden=False) - root_dir = _add_trailing_slash(dirname) - root_slicer = operator.itemgetter(slice(len(root_dir), None)) - paths = select(dirname, dirname, dir_fd=None, exists=False) - paths = map(root_slicer, paths) - return list(paths) - - -glob0 = functools.partial(_legacy_glob, _literal_selector) -glob1 = functools.partial(_legacy_glob, _wildcard_selector) From b0e8ba67e09dee442e23796ef4a87da7a401f5ae Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 19 Mar 2024 00:29:50 +0000 Subject: [PATCH 19/59] Fix stray newline --- Lib/glob.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index d5ca7d670739ee..ab6ba70e4d4309 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -394,4 +394,3 @@ def _select_exists(path, rel_path, dir_fd, exists): yield path except OSError: pass - From 0e02ec516411f5714dfb12444315f5b4c969d44b Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 28 Mar 2024 19:29:37 +0000 Subject: [PATCH 20/59] Remove tests for glob0 and glob1 --- Lib/test/test_glob.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index e5cf5ec4ead34f..6719bdbb0cc9b1 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -382,25 +382,6 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) - def test_glob0(self): - # This undocumented function matches literal paths - eq = self.assertSequencesEqual_noorder - eq(glob.glob0(self.tempdir, ''), ['']) - eq(glob.glob0(self.tempdir, 'a'), ['a']) - eq(glob.glob0(self.tempdir, '.bb'), ['.bb']) - eq(glob.glob0(self.tempdir, 'b'), []) - - def test_glob1(self): - # This undocumented function matches non-recursive wildcards - eq = self.assertSequencesEqual_noorder - eq(glob.glob1(self.tempdir, 'a'), ['a']) - eq(glob.glob1(self.tempdir, '.bb'), ['.bb']) - eq(glob.glob1(self.tempdir, '.b*'), ['.bb']) - eq(glob.glob1(self.tempdir, 'b'), []) - eq(glob.glob1(self.tempdir, '?'), ['a']) - eq(glob.glob1(self.tempdir, '*a'), ['a', 'aaa']) - eq(glob.glob1(self.tempdir, 'a*'), ['a', 'aaa', 'aab']) - def test_translate_matching(self): match = re.compile(glob.translate('*')).match self.assertIsNotNone(match('foo')) From be4865eaabe4f1493fb9b18fa36a6ee75b8eab4c Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 29 Mar 2024 17:12:15 +0000 Subject: [PATCH 21/59] Add a bunch of comments explaining the more subtle parts. --- Lib/glob.py | 109 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 41 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index ab6ba70e4d4309..31c33b4a77429e 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -62,14 +62,18 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, select = _selector(parts, recursive, include_hidden) if pathname: - # Absolute pattern. - drive = os.path.splitdrive(pathname)[0] - paths = select(pathname, pathname, dir_fd, not drive) + # Non-relative pattern. The anchor is guaranteed to exist unless it + # has a Windows drive component. + exists = not os.path.splitdrive(pathname)[0] + paths = select(pathname, pathname, dir_fd, exists) else: # Relative pattern. if root_dir is None: root_dir = os.path.curdir paths = _relative_glob(select, root_dir, dir_fd) + + # Ensure that the empty string is not yielded when given a pattern + # like '' or '**'. paths = itertools.dropwhile(operator.not_, paths) if is_bytes: paths = map(os.fsencode, paths) @@ -165,7 +169,10 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): @functools.lru_cache(maxsize=32768) def _compile_pattern(pattern, recursive, include_hidden): + """Compile an re.Pattern object for the given glob-style pattern. + """ if include_hidden: + # Optimization: don't compile patterns that would match all inputs. if recursive: if pattern == '**': return None @@ -246,12 +253,17 @@ def _literal_selector(parts, recursive, include_hidden): """Returns a function that selects a literal descendant of a given path. """ part = parts.pop() + + # Optimization: if the part is special, it doesn't affect whether paths + # are known to exist. is_special = part in _special_parts + + # Optimization: consume and join any subsequent literal parts here, rather + # than leaving them for the next selector. This reduces the number of + # string concatenation operations and calls to _add_trailing_slash(). while parts and magic_check.search(parts[-1]) is None: - # Consume next non-wildcard component (speeds up joining). next_part = parts.pop() - if next_part not in _special_parts: - is_special = False + is_special = is_special and next_part in _special_parts part += os.path.sep + next_part select_next = _selector(parts, recursive, include_hidden) @@ -269,42 +281,15 @@ def _wildcard_selector(parts, recursive, include_hidden): """ match = _compile_pattern(parts.pop(), False, include_hidden) - if parts: - select_next = _selector(parts, recursive, include_hidden) - def select_wildcard(path, rel_path, dir_fd, exists): - close_fd = False - try: - arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) - if fd is not None: - prefix = _add_trailing_slash(path) - # Ensure we don't exhaust file descriptors when globbing deep - # trees by closing the directory *before* yielding anything. - with os.scandir(arg) as scandir_it: - entries = list(scandir_it) - for entry in entries: - if match is None or match(entry.name): - try: - if entry.is_dir(): - entry_path = entry.path - if fd is not None: - entry_path = prefix + entry_path - yield from select_next(entry_path, entry.name, fd, True) - except OSError: - pass - except OSError: - pass - finally: - if close_fd: - os.close(fd) - - else: - def select_wildcard(path, rel_path, dir_fd, exists): + if not parts: + # Optimization: use os.listdir() rather than os.scandir(), because we + # don't need to distinguish between files and directories. We also + # yield results directly rather than passing them through a selector. + def select_last_wildcard(path, rel_path, dir_fd, exists): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) prefix = _add_trailing_slash(path) - # We use listdir() rather than scandir() because we don't need - # to check for subdirectories; we only need the child names. for name in os.listdir(arg): if match is None or match(name): yield prefix + name @@ -313,7 +298,36 @@ def select_wildcard(path, rel_path, dir_fd, exists): finally: if close_fd: os.close(fd) + return select_last_wildcard + + select_next = _selector(parts, recursive, include_hidden) + def select_wildcard(path, rel_path, dir_fd, exists): + close_fd = False + try: + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) + if fd is not None: + prefix = _add_trailing_slash(path) + # Ensure we don't exhaust file descriptors when globbing deep + # trees by closing the directory *before* yielding anything. + with os.scandir(arg) as scandir_it: + entries = list(scandir_it) + for entry in entries: + if match is None or match(entry.name): + try: + if entry.is_dir(): + entry_path = entry.path + if fd is not None: + entry_path = prefix + entry_path + yield from select_next( + entry_path, entry.name, fd, True) + except OSError: + pass + except OSError: + pass + finally: + if close_fd: + os.close(fd) return select_wildcard @@ -322,10 +336,17 @@ def _recursive_selector(parts, recursive, include_hidden): recursively, filtering by pattern. """ part = parts.pop() + + # Optimization: consume following '**' parts, which have no effect. while parts and parts[-1] == '**': parts.pop() + + # Optimization: consume and join any following non-special parts here, + # rather than leaving them for the next selector. They're used to build a + # regular expression, which we use to filter the results of the recursive + # walk. As a result, non-special pattern segments following a '**' + # wildcard don't require additional filesystem access to expand. while parts and parts[-1] not in _special_parts: - # Consume next non-special component (used to build regex). part += os.path.sep + parts.pop() match = _compile_pattern(part, True, include_hidden) @@ -364,11 +385,15 @@ def select_recursive_step(path, rel_path, dir_fd, match_pos): entry_path = prefix + entry_path if match is None or match(entry_path, match_pos): if dir_only: - yield from select_next(entry_path, entry.name, fd, True) + yield from select_next( + entry_path, entry.name, fd, True) else: + # Optimization: directly yield the path if this is + # last pattern part. yield entry_path if is_dir: - yield from select_recursive_step(entry_path, entry.name, fd, match_pos) + yield from select_recursive_step( + entry_path, entry.name, fd, match_pos) except OSError: pass finally: @@ -381,6 +406,8 @@ def _select_exists(path, rel_path, dir_fd, exists): """Yields the given path, if it exists. """ if exists: + # Optimization: this path is already known to exist, e.g. because it + # was returned from os.scandir(), so we skip calling lstat(). yield path elif dir_fd is None: try: From 13355a056ca80c4c704be4fcf2b0b090de7b00ac Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 3 Apr 2024 19:12:04 +0100 Subject: [PATCH 22/59] Clarify variable naming in iglob() --- Lib/glob.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 51ec5c9d331c2c..b658c7ca796bba 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -58,14 +58,14 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, pathname = os.fsdecode(pathname) if root_dir is not None: root_dir = os.fsdecode(root_dir) - pathname, parts = _split_pathname(pathname) + anchor, parts = _split_pathname(pathname) select = _selector(parts, recursive, include_hidden) - if pathname: + if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. - exists = not os.path.splitdrive(pathname)[0] - paths = select(pathname, pathname, dir_fd, exists) + exists = not os.path.splitdrive(anchor)[0] + paths = select(anchor, anchor, dir_fd, exists) else: # Relative pattern. if root_dir is None: From 2e5cebda195a1a6c8c721a9eddbd54673e7bbbd9 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 4 Apr 2024 02:25:59 +0100 Subject: [PATCH 23/59] Use keyword arguments to pass True/False/None literals, for clarity. --- Lib/glob.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index b658c7ca796bba..d686e0d80cfec3 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -60,7 +60,7 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, root_dir = os.fsdecode(root_dir) anchor, parts = _split_pathname(pathname) - select = _selector(parts, recursive, include_hidden) + select = _selector(parts, include_hidden, recursive) if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. @@ -87,15 +87,15 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, def glob0(dirname, pattern): import warnings warnings._deprecated("glob.glob0", _deprecated_function_message, remove=(3, 15)) - select = _literal_selector([pattern], False, False) - paths = _relative_glob(select, dirname, None) + select = _literal_selector([pattern], recursive=False, include_hidden=False) + paths = _relative_glob(select, dirname, dir_fd=None) return list(paths) def glob1(dirname, pattern): import warnings warnings._deprecated("glob.glob1", _deprecated_function_message, remove=(3, 15)) - select = _wildcard_selector([pattern], False, False) - paths = _relative_glob(select, dirname, None) + select = _wildcard_selector([pattern], recursive=False, include_hidden=False) + paths = _relative_glob(select, dirname, dir_fd=None) return list(paths) magic_check = re.compile('([*?[])') @@ -175,7 +175,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): @functools.lru_cache(maxsize=32768) -def _compile_pattern(pattern, recursive, include_hidden): +def _compile_pattern(pattern, include_hidden, recursive): """Compile an re.Pattern object for the given glob-style pattern. """ if include_hidden: @@ -186,8 +186,8 @@ def _compile_pattern(pattern, recursive, include_hidden): else: if pattern == '*': return None - regex = translate(pattern, recursive=recursive, - include_hidden=include_hidden, seps=os.path.sep) + regex = translate(pattern, include_hidden=include_hidden, + recursive=recursive, seps=os.path.sep) return re.compile(regex, flags=_pattern_flags).match @@ -236,12 +236,12 @@ def _relative_glob(select, dirname, dir_fd): """ dirname = _add_trailing_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) - paths = select(dirname, dirname, dir_fd, False) + paths = select(dirname, dirname, dir_fd, exists=False) paths = map(slicer, paths) return paths -def _selector(parts, recursive, include_hidden): +def _selector(parts, include_hidden, recursive): """Returns a function that selects from a given path, walking and filtering according to the glob-style pattern parts in *parts*. """ @@ -253,10 +253,10 @@ def _selector(parts, recursive, include_hidden): selector = _wildcard_selector else: selector = _literal_selector - return selector(parts, recursive, include_hidden) + return selector(parts, include_hidden, recursive) -def _literal_selector(parts, recursive, include_hidden): +def _literal_selector(parts, include_hidden, recursive): """Returns a function that selects a literal descendant of a given path. """ part = parts.pop() @@ -273,7 +273,7 @@ def _literal_selector(parts, recursive, include_hidden): is_special = is_special and next_part in _special_parts part += os.path.sep + next_part - select_next = _selector(parts, recursive, include_hidden) + select_next = _selector(parts, include_hidden, recursive) def select_literal(path, rel_path, dir_fd, exists): path = _add_trailing_slash(path) + part @@ -282,11 +282,11 @@ def select_literal(path, rel_path, dir_fd, exists): return select_literal -def _wildcard_selector(parts, recursive, include_hidden): +def _wildcard_selector(parts, include_hidden, recursive): """Returns a function that selects direct children of a given path, filtering by pattern. """ - match = _compile_pattern(parts.pop(), False, include_hidden) + match = _compile_pattern(parts.pop(), include_hidden, recursive=False) if not parts: # Optimization: use os.listdir() rather than os.scandir(), because we @@ -307,7 +307,7 @@ def select_last_wildcard(path, rel_path, dir_fd, exists): os.close(fd) return select_last_wildcard - select_next = _selector(parts, recursive, include_hidden) + select_next = _selector(parts, include_hidden, recursive) def select_wildcard(path, rel_path, dir_fd, exists): close_fd = False @@ -327,7 +327,7 @@ def select_wildcard(path, rel_path, dir_fd, exists): if fd is not None: entry_path = prefix + entry_path yield from select_next( - entry_path, entry.name, fd, True) + entry_path, entry.name, fd, exists=True) except OSError: pass except OSError: @@ -338,7 +338,7 @@ def select_wildcard(path, rel_path, dir_fd, exists): return select_wildcard -def _recursive_selector(parts, recursive, include_hidden): +def _recursive_selector(parts, include_hidden, recursive): """Returns a function that selects a given path and all its children, recursively, filtering by pattern. """ @@ -356,9 +356,9 @@ def _recursive_selector(parts, recursive, include_hidden): while parts and parts[-1] not in _special_parts: part += os.path.sep + parts.pop() - match = _compile_pattern(part, True, include_hidden) + match = _compile_pattern(part, include_hidden, recursive) dir_only = bool(parts) - select_next = _selector(parts, recursive, include_hidden) + select_next = _selector(parts, include_hidden, recursive) def select_recursive(path, rel_path, dir_fd, exists): path = _add_trailing_slash(path) @@ -393,7 +393,7 @@ def select_recursive_step(path, rel_path, dir_fd, match_pos): if match is None or match(entry_path, match_pos): if dir_only: yield from select_next( - entry_path, entry.name, fd, True) + entry_path, entry.name, fd, exists=True) else: # Optimization: directly yield the path if this is # last pattern part. From 5eba2eb75d697ec373515c708d802a94450cedf0 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 4 Apr 2024 03:52:31 +0100 Subject: [PATCH 24/59] Speed up recursive globbing very slightly --- Lib/glob.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index d686e0d80cfec3..d67783e5de36ac 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -360,15 +360,13 @@ def _recursive_selector(parts, include_hidden, recursive): dir_only = bool(parts) select_next = _selector(parts, include_hidden, recursive) - def select_recursive(path, rel_path, dir_fd, exists): - path = _add_trailing_slash(path) - rel_path = _add_trailing_slash(rel_path) - match_pos = len(path) - if match is None or match(path, match_pos): - yield from select_next(path, rel_path, dir_fd, exists) - yield from select_recursive_step(path, rel_path, dir_fd, match_pos) - - def select_recursive_step(path, rel_path, dir_fd, match_pos): + def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): + if match_pos is None: + path = _add_trailing_slash(path) + rel_path = _add_trailing_slash(rel_path) + match_pos = len(path) + if match is None or match(path, match_pos): + yield from select_next(path, rel_path, dir_fd, exists) close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -399,8 +397,8 @@ def select_recursive_step(path, rel_path, dir_fd, match_pos): # last pattern part. yield entry_path if is_dir: - yield from select_recursive_step( - entry_path, entry.name, fd, match_pos) + yield from select_recursive( + entry_path, entry.name, fd, exists, match_pos) except OSError: pass finally: From ad0ece8267aec349f4bb59016df63fcd6f55f799 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 5 Apr 2024 20:28:25 +0100 Subject: [PATCH 25/59] Implement recursive wildcards with a stack --- Lib/glob.py | 90 ++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index d67783e5de36ac..d151509567c874 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -360,50 +360,54 @@ def _recursive_selector(parts, include_hidden, recursive): dir_only = bool(parts) select_next = _selector(parts, include_hidden, recursive) - def select_recursive(path, rel_path, dir_fd, exists, match_pos=None): - if match_pos is None: - path = _add_trailing_slash(path) - rel_path = _add_trailing_slash(rel_path) - match_pos = len(path) - if match is None or match(path, match_pos): - yield from select_next(path, rel_path, dir_fd, exists) - close_fd = False - try: - arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) - if fd is not None: - prefix = _add_trailing_slash(path) - # Ensure we don't exhaust file descriptors when globbing deep - # trees by closing the directory *before* yielding anything. - with os.scandir(arg) as scandir_it: - entries = list(scandir_it) - for entry in entries: - is_dir = False - try: - if entry.is_dir(): - is_dir = True - except OSError: - pass - - if is_dir or not dir_only: - entry_path = entry.path - if fd is not None: - entry_path = prefix + entry_path - if match is None or match(entry_path, match_pos): - if dir_only: - yield from select_next( - entry_path, entry.name, fd, exists=True) - else: - # Optimization: directly yield the path if this is - # last pattern part. - yield entry_path - if is_dir: - yield from select_recursive( - entry_path, entry.name, fd, exists, match_pos) - except OSError: - pass - finally: + def select_recursive(path, rel_path, dir_fd, exists): + path = _add_trailing_slash(path) + rel_path = _add_trailing_slash(rel_path) + match_pos = len(path) + if match is None or match(path, match_pos): + yield from select_next(path, rel_path, dir_fd, exists) + stack = [(path, rel_path, dir_fd)] + while stack: + try: + yield from select_recursive_step(stack, match_pos) + except OSError: + pass + + def select_recursive_step(stack, match_pos): + path, rel_path, dir_fd = stack.pop() + if path is None: + os.close(dir_fd) + return + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) + if fd is not None: + prefix = _add_trailing_slash(path) if close_fd: - os.close(fd) + stack.append((None, None, fd)) + with os.scandir(arg) as scandir_it: + entries = list(scandir_it) + for entry in entries: + is_dir = False + try: + if entry.is_dir(): + is_dir = True + except OSError: + pass + + if is_dir or not dir_only: + entry_path = entry.path + if fd is not None: + entry_path = prefix + entry_path + if match is None or match(entry_path, match_pos): + if dir_only: + yield from select_next( + entry_path, entry.name, fd, exists=True) + else: + # Optimization: directly yield the path if this is + # last pattern part. + yield entry_path + if is_dir: + stack.append((entry_path, entry.name, fd)) + return select_recursive From cafe9be71a63818fd2bfd64339fca9ac2702781f Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 5 Apr 2024 21:30:12 +0100 Subject: [PATCH 26/59] Add argument defaults, simplify code slightly. --- Lib/glob.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index d151509567c874..05533af73bb1bf 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -87,16 +87,12 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, def glob0(dirname, pattern): import warnings warnings._deprecated("glob.glob0", _deprecated_function_message, remove=(3, 15)) - select = _literal_selector([pattern], recursive=False, include_hidden=False) - paths = _relative_glob(select, dirname, dir_fd=None) - return list(paths) + return list(_relative_glob(_literal_selector([pattern]), dirname)) def glob1(dirname, pattern): import warnings warnings._deprecated("glob.glob1", _deprecated_function_message, remove=(3, 15)) - select = _wildcard_selector([pattern], recursive=False, include_hidden=False) - paths = _relative_glob(select, dirname, dir_fd=None) - return list(paths) + return list(_relative_glob(_wildcard_selector([pattern]), dirname)) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -175,7 +171,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): @functools.lru_cache(maxsize=32768) -def _compile_pattern(pattern, include_hidden, recursive): +def _compile_pattern(pattern, include_hidden=False, recursive=False): """Compile an re.Pattern object for the given glob-style pattern. """ if include_hidden: @@ -230,18 +226,16 @@ def _open_dir(path, rel_path, dir_fd): return fd, fd, True -def _relative_glob(select, dirname, dir_fd): +def _relative_glob(select, dirname, dir_fd=None): """Globs using a select function from the given dirname. The dirname prefix is removed from results. """ dirname = _add_trailing_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) - paths = select(dirname, dirname, dir_fd, exists=False) - paths = map(slicer, paths) - return paths + return map(slicer, select(dirname, dirname, dir_fd)) -def _selector(parts, include_hidden, recursive): +def _selector(parts, include_hidden=False, recursive=False): """Returns a function that selects from a given path, walking and filtering according to the glob-style pattern parts in *parts*. """ @@ -256,7 +250,7 @@ def _selector(parts, include_hidden, recursive): return selector(parts, include_hidden, recursive) -def _literal_selector(parts, include_hidden, recursive): +def _literal_selector(parts, include_hidden=False, recursive=False): """Returns a function that selects a literal descendant of a given path. """ part = parts.pop() @@ -275,24 +269,24 @@ def _literal_selector(parts, include_hidden, recursive): select_next = _selector(parts, include_hidden, recursive) - def select_literal(path, rel_path, dir_fd, exists): + def select_literal(path, rel_path, dir_fd=None, exists=False): path = _add_trailing_slash(path) + part rel_path = _add_trailing_slash(rel_path) + part return select_next(path, rel_path, dir_fd, exists and is_special) return select_literal -def _wildcard_selector(parts, include_hidden, recursive): +def _wildcard_selector(parts, include_hidden=False, recursive=False): """Returns a function that selects direct children of a given path, filtering by pattern. """ - match = _compile_pattern(parts.pop(), include_hidden, recursive=False) + match = _compile_pattern(parts.pop(), include_hidden) if not parts: # Optimization: use os.listdir() rather than os.scandir(), because we # don't need to distinguish between files and directories. We also # yield results directly rather than passing them through a selector. - def select_last_wildcard(path, rel_path, dir_fd, exists): + def select_last_wildcard(path, rel_path, dir_fd=None, exists=False): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -309,7 +303,7 @@ def select_last_wildcard(path, rel_path, dir_fd, exists): select_next = _selector(parts, include_hidden, recursive) - def select_wildcard(path, rel_path, dir_fd, exists): + def select_wildcard(path, rel_path, dir_fd=None, exists=False): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -338,7 +332,7 @@ def select_wildcard(path, rel_path, dir_fd, exists): return select_wildcard -def _recursive_selector(parts, include_hidden, recursive): +def _recursive_selector(parts, include_hidden=False, recursive=False): """Returns a function that selects a given path and all its children, recursively, filtering by pattern. """ @@ -360,7 +354,7 @@ def _recursive_selector(parts, include_hidden, recursive): dir_only = bool(parts) select_next = _selector(parts, include_hidden, recursive) - def select_recursive(path, rel_path, dir_fd, exists): + def select_recursive(path, rel_path, dir_fd=None, exists=False): path = _add_trailing_slash(path) rel_path = _add_trailing_slash(rel_path) match_pos = len(path) @@ -411,7 +405,7 @@ def select_recursive_step(stack, match_pos): return select_recursive -def _select_exists(path, rel_path, dir_fd, exists): +def _select_exists(path, rel_path, dir_fd=None, exists=False): """Yields the given path, if it exists. """ if exists: From 301d9221978608a23f85c74cdaa485bb790dbf6c Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 5 Apr 2024 21:57:05 +0100 Subject: [PATCH 27/59] Also make rel_path optional --- Lib/glob.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 05533af73bb1bf..5bdb8bc567170b 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -210,7 +210,7 @@ def _add_trailing_slash(pathname): return pathname + os.path.sep -def _open_dir(path, rel_path, dir_fd): +def _open_dir(path, rel_path=None, dir_fd=None): """Prepares the directory for scanning. Returns a 3-tuple with parts: 1. A path or fd to supply to `os.scandir()`. @@ -269,9 +269,10 @@ def _literal_selector(parts, include_hidden=False, recursive=False): select_next = _selector(parts, include_hidden, recursive) - def select_literal(path, rel_path, dir_fd=None, exists=False): + def select_literal(path, rel_path=None, dir_fd=None, exists=False): path = _add_trailing_slash(path) + part - rel_path = _add_trailing_slash(rel_path) + part + if dir_fd is not None: + rel_path = _add_trailing_slash(rel_path) + part return select_next(path, rel_path, dir_fd, exists and is_special) return select_literal @@ -286,7 +287,7 @@ def _wildcard_selector(parts, include_hidden=False, recursive=False): # Optimization: use os.listdir() rather than os.scandir(), because we # don't need to distinguish between files and directories. We also # yield results directly rather than passing them through a selector. - def select_last_wildcard(path, rel_path, dir_fd=None, exists=False): + def select_last_wildcard(path, rel_path=None, dir_fd=None, exists=False): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -303,7 +304,7 @@ def select_last_wildcard(path, rel_path, dir_fd=None, exists=False): select_next = _selector(parts, include_hidden, recursive) - def select_wildcard(path, rel_path, dir_fd=None, exists=False): + def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): close_fd = False try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) @@ -354,9 +355,10 @@ def _recursive_selector(parts, include_hidden=False, recursive=False): dir_only = bool(parts) select_next = _selector(parts, include_hidden, recursive) - def select_recursive(path, rel_path, dir_fd=None, exists=False): + def select_recursive(path, rel_path=None, dir_fd=None, exists=False): path = _add_trailing_slash(path) - rel_path = _add_trailing_slash(rel_path) + if dir_fd is not None: + rel_path = _add_trailing_slash(rel_path) match_pos = len(path) if match is None or match(path, match_pos): yield from select_next(path, rel_path, dir_fd, exists) @@ -405,7 +407,7 @@ def select_recursive_step(stack, match_pos): return select_recursive -def _select_exists(path, rel_path, dir_fd=None, exists=False): +def _select_exists(path, rel_path=None, dir_fd=None, exists=False): """Yields the given path, if it exists. """ if exists: From beb2507002e2621d38e690b808df29d750dd23de Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 5 Apr 2024 22:00:00 +0100 Subject: [PATCH 28/59] Optimise _add_trailing_slash --- Lib/glob.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 5bdb8bc567170b..8bda620d5d1863 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -201,13 +201,18 @@ def _split_pathname(pathname): return dirname, parts -def _add_trailing_slash(pathname): - """Returns the given path with a trailing slash added, where possible. - """ - tail = os.path.splitdrive(pathname)[1] - if not tail or tail[-1] in _path_seps: - return pathname - return pathname + os.path.sep +# Returns the given path with a trailing slash added, where possible. +if os.name == 'nt': + def _add_trailing_slash(pathname): + tail = os.path.splitroot(pathname)[2] + if not tail or tail[-1] in '\\/': + return pathname + return f'{pathname}\\' +else: + def _add_trailing_slash(pathname): + if not pathname or pathname[-1] == '/': + return pathname + return f'{pathname}/' def _open_dir(path, rel_path=None, dir_fd=None): From 312c73a5bea299a3c44107d1974db4f59fa04ac3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 02:19:49 +0100 Subject: [PATCH 29/59] Remove use of os.listdir() -- doesn't generalise --- Lib/glob.py | 47 +++++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 8bda620d5d1863..646ea49e1ba1c9 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -287,27 +287,9 @@ def _wildcard_selector(parts, include_hidden=False, recursive=False): filtering by pattern. """ match = _compile_pattern(parts.pop(), include_hidden) - - if not parts: - # Optimization: use os.listdir() rather than os.scandir(), because we - # don't need to distinguish between files and directories. We also - # yield results directly rather than passing them through a selector. - def select_last_wildcard(path, rel_path=None, dir_fd=None, exists=False): - close_fd = False - try: - arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) - prefix = _add_trailing_slash(path) - for name in os.listdir(arg): - if match is None or match(name): - yield prefix + name - except OSError: - pass - finally: - if close_fd: - os.close(fd) - return select_last_wildcard - - select_next = _selector(parts, include_hidden, recursive) + dir_only = bool(parts) + if dir_only: + select_next = _selector(parts, include_hidden, recursive) def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): close_fd = False @@ -321,15 +303,20 @@ def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): entries = list(scandir_it) for entry in entries: if match is None or match(entry.name): - try: - if entry.is_dir(): - entry_path = entry.path - if fd is not None: - entry_path = prefix + entry_path - yield from select_next( - entry_path, entry.name, fd, exists=True) - except OSError: - pass + if dir_only: + try: + if not entry.is_dir(): + continue + except OSError: + pass + entry_path = entry.path + if fd is not None: + entry_path = prefix + entry_path + if dir_only: + yield from select_next( + entry_path, entry.name, fd, exists=True) + else: + yield entry_path except OSError: pass finally: From ae820e2c0cf6df1ee82ee363283612fb5dfb5f85 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 04:58:08 +0100 Subject: [PATCH 30/59] Add `_Globber` class; prepare for merger with pathlib globbing. --- Lib/glob.py | 364 ++++++++++++++++++++++++++-------------------------- 1 file changed, 185 insertions(+), 179 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 646ea49e1ba1c9..4e58d06a574805 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -60,7 +60,7 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, root_dir = os.fsdecode(root_dir) anchor, parts = _split_pathname(pathname) - select = _selector(parts, include_hidden, recursive) + select = _Globber(include_hidden, recursive).selector(parts) if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. @@ -87,12 +87,12 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, def glob0(dirname, pattern): import warnings warnings._deprecated("glob.glob0", _deprecated_function_message, remove=(3, 15)) - return list(_relative_glob(_literal_selector([pattern]), dirname)) + return list(_relative_glob(_Globber().literal_selector([pattern]), dirname)) def glob1(dirname, pattern): import warnings warnings._deprecated("glob.glob1", _deprecated_function_message, remove=(3, 15)) - return list(_relative_glob(_wildcard_selector([pattern]), dirname)) + return list(_relative_glob(_Globber().wildcard_selector([pattern]), dirname)) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -171,19 +171,11 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): @functools.lru_cache(maxsize=32768) -def _compile_pattern(pattern, include_hidden=False, recursive=False): +def _compile_pattern(include_hidden, recursive, sep, pattern): """Compile an re.Pattern object for the given glob-style pattern. """ - if include_hidden: - # Optimization: don't compile patterns that would match all inputs. - if recursive: - if pattern == '**': - return None - else: - if pattern == '*': - return None regex = translate(pattern, include_hidden=include_hidden, - recursive=recursive, seps=os.path.sep) + recursive=recursive, seps=sep) return re.compile(regex, flags=_pattern_flags).match @@ -203,13 +195,13 @@ def _split_pathname(pathname): # Returns the given path with a trailing slash added, where possible. if os.name == 'nt': - def _add_trailing_slash(pathname): + def _add_slash(pathname): tail = os.path.splitroot(pathname)[2] if not tail or tail[-1] in '\\/': return pathname return f'{pathname}\\' else: - def _add_trailing_slash(pathname): + def _add_slash(pathname): if not pathname or pathname[-1] == '/': return pathname return f'{pathname}/' @@ -235,186 +227,200 @@ def _relative_glob(select, dirname, dir_fd=None): """Globs using a select function from the given dirname. The dirname prefix is removed from results. """ - dirname = _add_trailing_slash(dirname) + dirname = _add_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) return map(slicer, select(dirname, dirname, dir_fd)) -def _selector(parts, include_hidden=False, recursive=False): - """Returns a function that selects from a given path, walking and - filtering according to the glob-style pattern parts in *parts*. - """ - if not parts: - return _select_exists - elif recursive and parts[-1] == '**': - selector = _recursive_selector - elif magic_check.search(parts[-1]) is not None: - selector = _wildcard_selector - else: - selector = _literal_selector - return selector(parts, include_hidden, recursive) - - -def _literal_selector(parts, include_hidden=False, recursive=False): - """Returns a function that selects a literal descendant of a given path. - """ - part = parts.pop() - - # Optimization: if the part is special, it doesn't affect whether paths - # are known to exist. - is_special = part in _special_parts - - # Optimization: consume and join any subsequent literal parts here, rather - # than leaving them for the next selector. This reduces the number of - # string concatenation operations and calls to _add_trailing_slash(). - while parts and magic_check.search(parts[-1]) is None: - next_part = parts.pop() - is_special = is_special and next_part in _special_parts - part += os.path.sep + next_part - - select_next = _selector(parts, include_hidden, recursive) - - def select_literal(path, rel_path=None, dir_fd=None, exists=False): - path = _add_trailing_slash(path) + part - if dir_fd is not None: - rel_path = _add_trailing_slash(rel_path) + part - return select_next(path, rel_path, dir_fd, exists and is_special) - return select_literal - +class _Globber: + def __init__(self, include_hidden=False, recursive=False, sep=os.path.sep): + self.include_hidden = include_hidden + self.recursive = recursive + self.sep = sep + self.compile = functools.partial( + _compile_pattern, include_hidden, recursive, sep) + + def selector(self, parts): + """Returns a function that selects from a given path, walking and + filtering according to the glob-style pattern parts in *parts*. + """ + if not parts: + return self.select_exists + elif self.recursive and parts[-1] == '**': + selector = self.recursive_selector + elif magic_check.search(parts[-1]) is not None: + selector = self.wildcard_selector + else: + selector = self.literal_selector + return selector(parts) + + def literal_selector(self, parts): + """Returns a function that selects a literal descendant of a path. + """ + part = parts.pop() + + # Optimization: if the part is special, it doesn't affect whether + # paths are known to exist. + is_special = part in _special_parts + + # Optimization: consume and join any subsequent literal parts here, + # rather than leaving them for the next selector. This reduces the + # number of string concatenation operations and calls to add_slash(). + while parts and magic_check.search(parts[-1]) is None: + next_part = parts.pop() + is_special = is_special and next_part in _special_parts + part += self.sep + next_part + + select_next = self.selector(parts) + + def select_literal(path, rel_path=None, dir_fd=None, exists=False): + path = _add_slash(path) + part + if dir_fd is not None: + rel_path = _add_slash(rel_path) + part + return select_next(path, rel_path, dir_fd, exists and is_special) + return select_literal + + def wildcard_selector(self, parts): + """Returns a function that selects direct children of a given path, + filtering by pattern. + """ + part = parts.pop() + if self.include_hidden and part == '*': + match = None + else: + match = self.compile(part) + dir_only = bool(parts) + if dir_only: + select_next = self.selector(parts) -def _wildcard_selector(parts, include_hidden=False, recursive=False): - """Returns a function that selects direct children of a given path, - filtering by pattern. - """ - match = _compile_pattern(parts.pop(), include_hidden) - dir_only = bool(parts) - if dir_only: - select_next = _selector(parts, include_hidden, recursive) - - def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): - close_fd = False - try: + def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): + close_fd = False + try: + arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) + if fd is not None: + prefix = _add_slash(path) + # Ensure we don't exhaust file descriptors when globbing deep + # trees by closing the directory *before* yielding anything. + with os.scandir(arg) as scandir_obj: + entries = list(scandir_obj) + for entry in entries: + if match is None or match(entry.name): + if dir_only: + try: + if not entry.is_dir(): + continue + except OSError: + continue + entry_path = entry.path + if fd is not None: + entry_path = prefix + entry_path + if dir_only: + yield from select_next( + entry_path, entry.name, fd, exists=True) + else: + yield entry_path + except OSError: + pass + finally: + if close_fd: + os.close(fd) + return select_wildcard + + def recursive_selector(self, parts): + """Returns a function that selects a given path and all its children, + recursively, filtering by pattern. + """ + part = parts.pop() + + # Optimization: consume following '**' parts, which have no effect. + while parts and parts[-1] == '**': + parts.pop() + + # Optimization: consume and join any following non-special parts here, + # rather than leaving them for the next selector. They're used to + # build a regular expression, which we use to filter the results of + # the recursive walk. As a result, non-special pattern segments + # following a '**' wildcard don't require additional filesystem access + # to expand. + while parts and parts[-1] not in _special_parts: + part += self.sep + parts.pop() + + dir_only = bool(parts) + if self.include_hidden and part == '**': + match = None + else: + match = self.compile(part) + select_next = self.selector(parts) + + def select_recursive(path, rel_path=None, dir_fd=None, exists=False): + path = _add_slash(path) + if dir_fd is not None: + rel_path = _add_slash(rel_path) + match_pos = len(str(path)) + if match is None or match(str(path), match_pos): + yield from select_next(path, rel_path, dir_fd, exists) + stack = [(path, rel_path, dir_fd)] + while stack: + try: + yield from select_recursive_step(stack, match_pos) + except OSError: + pass + + def select_recursive_step(stack, match_pos): + path, rel_path, dir_fd = stack.pop() + if path is None: + os.close(dir_fd) + return arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: - prefix = _add_trailing_slash(path) + prefix = _add_slash(path) + if close_fd: + stack.append((None, None, fd)) # Ensure we don't exhaust file descriptors when globbing deep # trees by closing the directory *before* yielding anything. - with os.scandir(arg) as scandir_it: - entries = list(scandir_it) + with os.scandir(arg) as scandir_obj: + entries = list(scandir_obj) for entry in entries: - if match is None or match(entry.name): - if dir_only: - try: - if not entry.is_dir(): - continue - except OSError: - pass + is_dir = False + try: + if entry.is_dir(): + is_dir = True + except OSError: + pass + + if is_dir or not dir_only: entry_path = entry.path if fd is not None: entry_path = prefix + entry_path - if dir_only: - yield from select_next( - entry_path, entry.name, fd, exists=True) - else: - yield entry_path - except OSError: - pass - finally: - if close_fd: - os.close(fd) - return select_wildcard - - -def _recursive_selector(parts, include_hidden=False, recursive=False): - """Returns a function that selects a given path and all its children, - recursively, filtering by pattern. - """ - part = parts.pop() - - # Optimization: consume following '**' parts, which have no effect. - while parts and parts[-1] == '**': - parts.pop() - - # Optimization: consume and join any following non-special parts here, - # rather than leaving them for the next selector. They're used to build a - # regular expression, which we use to filter the results of the recursive - # walk. As a result, non-special pattern segments following a '**' - # wildcard don't require additional filesystem access to expand. - while parts and parts[-1] not in _special_parts: - part += os.path.sep + parts.pop() - - match = _compile_pattern(part, include_hidden, recursive) - dir_only = bool(parts) - select_next = _selector(parts, include_hidden, recursive) - - def select_recursive(path, rel_path=None, dir_fd=None, exists=False): - path = _add_trailing_slash(path) - if dir_fd is not None: - rel_path = _add_trailing_slash(rel_path) - match_pos = len(path) - if match is None or match(path, match_pos): - yield from select_next(path, rel_path, dir_fd, exists) - stack = [(path, rel_path, dir_fd)] - while stack: + if match is None or match(str(entry_path), match_pos): + if dir_only: + yield from select_next( + entry_path, entry.name, fd, exists=True) + else: + # Optimization: directly yield the path if this is + # last pattern part. + yield entry_path + if is_dir: + stack.append((entry_path, entry.name, fd)) + + return select_recursive + + def select_exists(self, path, rel_path=None, dir_fd=None, exists=False): + """Yields the given path, if it exists. + """ + if exists: + # Optimization: this path is already known to exist, e.g. because + # it was returned from os.scandir(), so we skip calling lstat(). + yield path + elif dir_fd is None: try: - yield from select_recursive_step(stack, match_pos) + os.lstat(path) + yield path except OSError: pass - - def select_recursive_step(stack, match_pos): - path, rel_path, dir_fd = stack.pop() - if path is None: - os.close(dir_fd) - return - arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) - if fd is not None: - prefix = _add_trailing_slash(path) - if close_fd: - stack.append((None, None, fd)) - with os.scandir(arg) as scandir_it: - entries = list(scandir_it) - for entry in entries: - is_dir = False + else: try: - if entry.is_dir(): - is_dir = True + os.lstat(rel_path, dir_fd=dir_fd) + yield path except OSError: pass - - if is_dir or not dir_only: - entry_path = entry.path - if fd is not None: - entry_path = prefix + entry_path - if match is None or match(entry_path, match_pos): - if dir_only: - yield from select_next( - entry_path, entry.name, fd, exists=True) - else: - # Optimization: directly yield the path if this is - # last pattern part. - yield entry_path - if is_dir: - stack.append((entry_path, entry.name, fd)) - - return select_recursive - - -def _select_exists(path, rel_path=None, dir_fd=None, exists=False): - """Yields the given path, if it exists. - """ - if exists: - # Optimization: this path is already known to exist, e.g. because it - # was returned from os.scandir(), so we skip calling lstat(). - yield path - elif dir_fd is None: - try: - os.lstat(path) - yield path - except OSError: - pass - else: - try: - os.lstat(rel_path, dir_fd=dir_fd) - yield path - except OSError: - pass From dcfe11d8089a26a186873f1153b7e69890fa2068 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 05:19:14 +0100 Subject: [PATCH 31/59] Unify with pathlib implementation \o/ --- Lib/glob.py | 58 ++++--- Lib/pathlib/__init__.py | 42 ++--- Lib/pathlib/_abc.py | 198 ++++------------------ Lib/test/test_pathlib/test_pathlib_abc.py | 6 +- 4 files changed, 89 insertions(+), 215 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 4e58d06a574805..6b02815cb1b403 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -12,8 +12,9 @@ _special_parts = ('', os.path.curdir, os.path.pardir) -_pattern_flags = re.NOFLAG if os.path.normcase('Aa') == 'Aa' else re.IGNORECASE +_case_sensitive = os.path.normcase('Aa') == 'Aa' _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) +_disable_recurse_symlinks = object() if os.path.altsep: _path_seps = (os.path.sep, os.path.altsep) else: @@ -171,12 +172,13 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): @functools.lru_cache(maxsize=32768) -def _compile_pattern(include_hidden, recursive, sep, pattern): +def _compile_pattern(include_hidden, recursive, case_sensitive, sep, pattern): """Compile an re.Pattern object for the given glob-style pattern. """ + flags = re.NOFLAG if case_sensitive else re.IGNORECASE regex = translate(pattern, include_hidden=include_hidden, recursive=recursive, seps=sep) - return re.compile(regex, flags=_pattern_flags).match + return re.compile(regex, flags=flags).match def _split_pathname(pathname): @@ -233,12 +235,20 @@ def _relative_glob(select, dirname, dir_fd=None): class _Globber: - def __init__(self, include_hidden=False, recursive=False, sep=os.path.sep): + def __init__(self, include_hidden=False, recursive=False, + case_sensitive=_case_sensitive, sep=os.path.sep): self.include_hidden = include_hidden self.recursive = recursive + self.case_sensitive = case_sensitive self.sep = sep self.compile = functools.partial( - _compile_pattern, include_hidden, recursive, sep) + _compile_pattern, include_hidden, recursive, case_sensitive, sep) + + lstat = staticmethod(os.lstat) + scandir = staticmethod(os.scandir) + add_slash = staticmethod(_add_slash) + concat_path = operator.add + parse_entry = operator.attrgetter('path') def selector(self, parts): """Returns a function that selects from a given path, walking and @@ -248,6 +258,8 @@ def selector(self, parts): return self.select_exists elif self.recursive and parts[-1] == '**': selector = self.recursive_selector + elif self.case_sensitive != _case_sensitive: + selector = self.wildcard_selector elif magic_check.search(parts[-1]) is not None: selector = self.wildcard_selector else: @@ -274,9 +286,9 @@ def literal_selector(self, parts): select_next = self.selector(parts) def select_literal(path, rel_path=None, dir_fd=None, exists=False): - path = _add_slash(path) + part + path = self.concat_path(self.add_slash(path), part) if dir_fd is not None: - rel_path = _add_slash(rel_path) + part + rel_path = self.concat_path(self.add_slash(rel_path), part) return select_next(path, rel_path, dir_fd, exists and is_special) return select_literal @@ -298,10 +310,10 @@ def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): try: arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: - prefix = _add_slash(path) + prefix = self.add_slash(path) # Ensure we don't exhaust file descriptors when globbing deep # trees by closing the directory *before* yielding anything. - with os.scandir(arg) as scandir_obj: + with self.scandir(arg) as scandir_obj: entries = list(scandir_obj) for entry in entries: if match is None or match(entry.name): @@ -311,9 +323,9 @@ def select_wildcard(path, rel_path=None, dir_fd=None, exists=False): continue except OSError: continue - entry_path = entry.path + entry_path = self.parse_entry(entry) if fd is not None: - entry_path = prefix + entry_path + entry_path = self.concat_path(prefix, entry_path) if dir_only: yield from select_next( entry_path, entry.name, fd, exists=True) @@ -342,8 +354,10 @@ def recursive_selector(self, parts): # the recursive walk. As a result, non-special pattern segments # following a '**' wildcard don't require additional filesystem access # to expand. - while parts and parts[-1] not in _special_parts: - part += self.sep + parts.pop() + follow_symlinks = self.recursive is not _disable_recurse_symlinks + if follow_symlinks: + while parts and parts[-1] not in _special_parts: + part += self.sep + parts.pop() dir_only = bool(parts) if self.include_hidden and part == '**': @@ -353,9 +367,9 @@ def recursive_selector(self, parts): select_next = self.selector(parts) def select_recursive(path, rel_path=None, dir_fd=None, exists=False): - path = _add_slash(path) + path = self.add_slash(path) if dir_fd is not None: - rel_path = _add_slash(rel_path) + rel_path = self.add_slash(rel_path) match_pos = len(str(path)) if match is None or match(str(path), match_pos): yield from select_next(path, rel_path, dir_fd, exists) @@ -373,25 +387,25 @@ def select_recursive_step(stack, match_pos): return arg, fd, close_fd = _open_dir(path, rel_path, dir_fd) if fd is not None: - prefix = _add_slash(path) + prefix = self.add_slash(path) if close_fd: stack.append((None, None, fd)) # Ensure we don't exhaust file descriptors when globbing deep # trees by closing the directory *before* yielding anything. - with os.scandir(arg) as scandir_obj: + with self.scandir(arg) as scandir_obj: entries = list(scandir_obj) for entry in entries: is_dir = False try: - if entry.is_dir(): + if entry.is_dir(follow_symlinks=follow_symlinks): is_dir = True except OSError: pass if is_dir or not dir_only: - entry_path = entry.path + entry_path = self.parse_entry(entry) if fd is not None: - entry_path = prefix + entry_path + entry_path = self.concat_path(prefix, entry_path) if match is None or match(str(entry_path), match_pos): if dir_only: yield from select_next( @@ -414,13 +428,13 @@ def select_exists(self, path, rel_path=None, dir_fd=None, exists=False): yield path elif dir_fd is None: try: - os.lstat(path) + self.lstat(path) yield path except OSError: pass else: try: - os.lstat(rel_path, dir_fd=dir_fd) + self.lstat(rel_path, dir_fd=dir_fd) yield path except OSError: pass diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 747000f1a43475..8f3d7a9d725c0c 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -5,6 +5,7 @@ operating systems. """ +import glob import io import ntpath import os @@ -111,6 +112,7 @@ class PurePath(_abc.PurePathBase): '_hash', ) parser = os.path + _globber = glob._Globber def __new__(cls, *args, **kwargs): """Construct a PurePath from one or several strings and or existing @@ -453,21 +455,6 @@ def as_uri(self): from urllib.parse import quote_from_bytes return prefix + quote_from_bytes(os.fsencode(path)) - @property - def _pattern_stack(self): - """Stack of path components, to be used with patterns in glob().""" - parts = self._tail.copy() - pattern = self._raw_path - if self.anchor: - raise NotImplementedError("Non-relative patterns are unsupported") - elif not parts: - raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - elif pattern[-1] in (self.parser.sep, self.parser.altsep): - # GH-65238: pathlib doesn't preserve trailing slash. Add it back. - parts.append('') - parts.reverse() - return parts - @property def _pattern_str(self): """The path expressed as a string, for use in pattern-matching.""" @@ -587,13 +574,9 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _direntry_str(self, entry): - # Transform an entry yielded from _scandir() into a path string. - return entry.name if str(self) == '.' else entry.path - def _make_child_direntry(self, entry): - # Transform an entry yielded from _scandir() into a path object. - path_str = self._direntry_str(entry) + # Transform an entry yielded from os.scandir() into a path object. + path_str = entry.name if str(self) == '.' else entry.path path = self.with_segments(path_str) path._str = path_str path._drv = self.drive @@ -626,8 +609,18 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): sys.audit("pathlib.Path.glob", self, pattern) if not isinstance(pattern, PurePath): pattern = self.with_segments(pattern) - return _abc.PathBase.glob( - self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) + if pattern.anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + parts = pattern._tail.copy() + if not parts: + raise ValueError("Unacceptable pattern: {!r}".format(pattern)) + raw = pattern._raw_path + if raw[-1] in (self.parser.sep, self.parser.altsep): + # GH-65238: pathlib doesn't preserve trailing slash. Add it back. + parts.append('') + parts.reverse() + select = self._glob_selector(parts, case_sensitive, recurse_symlinks) + return map(self.with_segments, select(str(self))) def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Recursively yield all existing files (of any kind, including @@ -638,8 +631,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): if not isinstance(pattern, PurePath): pattern = self.with_segments(pattern) pattern = '**' / pattern - return _abc.PathBase.glob( - self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) + return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index ca38a51d072cfb..62ae0c557b6001 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -12,6 +12,8 @@ """ import functools +import glob +import operator from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO @@ -40,109 +42,17 @@ def _ignore_error(exception): def _is_case_sensitive(parser): return parser.normcase('Aa') == 'Aa' -# -# Globbing helpers -# - -re = glob = None - - -@functools.lru_cache(maxsize=512) -def _compile_pattern(pat, sep, case_sensitive, recursive=True): - """Compile given glob pattern to a re.Pattern object (observing case - sensitivity).""" - global re, glob - if re is None: - import re, glob - - flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep) - return re.compile(regex, flags=flags).match +class _Globber(glob._Globber): + lstat = operator.methodcaller('lstat') + scandir = operator.methodcaller('_scandir') + add_slash = operator.methodcaller('joinpath', '') -def _select_special(paths, part): - """Yield special literal children of the given paths.""" - for path in paths: - yield path._make_child_relpath(part) - - -def _select_children(parent_paths, dir_only, match): - """Yield direct children of given paths, filtering by name and type.""" - for parent_path in parent_paths: - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with parent_path._scandir() as scandir_it: - entries = list(scandir_it) - except OSError: - pass - else: - for entry in entries: - if dir_only: - try: - if not entry.is_dir(): - continue - except OSError: - continue - # Avoid cost of making a path object for non-matching paths by - # matching against the os.DirEntry.name string. - if match is None or match(entry.name): - yield parent_path._make_child_direntry(entry) - + def concat_path(self, path, text): + return path.with_segments(path._raw_path + text) -def _select_recursive(parent_paths, dir_only, follow_symlinks, match): - """Yield given paths and all their children, recursively, filtering by - string and type. - """ - for parent_path in parent_paths: - if match is not None: - # If we're filtering paths through a regex, record the length of - # the parent path. We'll pass it to match(path, pos=...) later. - parent_len = len(str(parent_path._make_child_relpath('_'))) - 1 - paths = [parent_path._make_child_relpath('')] - while paths: - path = paths.pop() - if match is None or match(str(path), parent_len): - # Yield *directory* path that matches pattern (if any). - yield path - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with path._scandir() as scandir_it: - entries = list(scandir_it) - except OSError: - pass - else: - for entry in entries: - # Handle directory entry. - try: - if entry.is_dir(follow_symlinks=follow_symlinks): - # Recurse into this directory. - paths.append(path._make_child_direntry(entry)) - continue - except OSError: - pass - - # Handle file entry. - if not dir_only: - # Avoid cost of making a path object for non-matching - # files by matching against the os.DirEntry object. - if match is None or match(path._direntry_str(entry), parent_len): - # Yield *file* path that matches pattern (if any). - yield path._make_child_direntry(entry) - - -def _select_unique(paths): - """Yields the given paths, filtering out duplicates.""" - yielded = set() - try: - for path in paths: - path_str = str(path) - if path_str not in yielded: - yield path - yielded.add(path_str) - finally: - yielded.clear() + def parse_entry(self, entry): + return entry class UnsupportedOperation(NotImplementedError): @@ -218,6 +128,7 @@ class PurePathBase: '_resolving', ) parser = ParserBase() + _globber = _Globber def __init__(self, path, *paths): self._raw_path = self.parser.join(path, *paths) if paths else path @@ -454,14 +365,6 @@ def is_absolute(self): a drive).""" return self.parser.isabs(self._raw_path) - @property - def _pattern_stack(self): - """Stack of path components, to be used with patterns in glob().""" - anchor, parts = self._stack - if anchor: - raise NotImplementedError("Non-relative patterns are unsupported") - return parts - @property def _pattern_str(self): """The path expressed as a string, for use in pattern-matching.""" @@ -487,8 +390,9 @@ def match(self, path_pattern, *, case_sensitive=None): return False if len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False + globber = self._globber(include_hidden=True, case_sensitive=case_sensitive, sep=sep) for path_part, pattern_part in zip(path_parts, pattern_parts): - match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False) + match = globber.compile(pattern_part) if match(path_part) is None: return False return True @@ -502,7 +406,12 @@ def full_match(self, pattern, *, case_sensitive=None): pattern = self.with_segments(pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) - match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive) + globber = self._globber( + include_hidden=True, + recursive=True, + case_sensitive=case_sensitive, + sep=pattern.parser.sep) + match = globber.compile(pattern._pattern_str) return match(self._pattern_str) is not None @@ -772,11 +681,6 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _direntry_str(self, entry): - # Transform an entry yielded from _scandir() into a path string. - # PathBase._scandir() yields PathBase objects, so use str(). - return str(entry) - def _make_child_direntry(self, entry): # Transform an entry yielded from _scandir() into a path object. # PathBase._scandir() yields PathBase objects, so this is a no-op. @@ -785,62 +689,26 @@ def _make_child_direntry(self, entry): def _make_child_relpath(self, name): return self.joinpath(name) + def _glob_selector(self, parts, case_sensitive, recurse_symlinks): + if case_sensitive is None: + case_sensitive = _is_case_sensitive(self.parser) + globber = self._globber( + include_hidden=True, + recursive=True if recurse_symlinks else glob._disable_recurse_symlinks, + case_sensitive=case_sensitive, + sep=self.parser.sep) + return globber.selector(parts) + def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): """Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. """ if not isinstance(pattern, PurePathBase): pattern = self.with_segments(pattern) - if case_sensitive is None: - # TODO: evaluate case-sensitivity of each directory in _select_children(). - case_sensitive = _is_case_sensitive(self.parser) - - stack = pattern._pattern_stack - specials = ('', '.', '..') - deduplicate_paths = False - sep = self.parser.sep - paths = iter([self] if self.is_dir() else []) - while stack: - part = stack.pop() - if part in specials: - # Join special component (e.g. '..') onto paths. - paths = _select_special(paths, part) - - elif part == '**': - # Consume following '**' components, which have no effect. - while stack and stack[-1] == '**': - stack.pop() - - # Consume following non-special components, provided we're - # treating symlinks consistently. Each component is joined - # onto 'part', which is used to generate an re.Pattern object. - if recurse_symlinks: - while stack and stack[-1] not in specials: - part += sep + stack.pop() - - # If the previous loop consumed pattern components, compile an - # re.Pattern object based on those components. - match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None - - # Recursively walk directories, filtering by type and regex. - paths = _select_recursive(paths, bool(stack), recurse_symlinks, match) - - # De-duplicate if we've already seen a '**' component. - if deduplicate_paths: - paths = _select_unique(paths) - deduplicate_paths = True - - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - - else: - # If the pattern component isn't '*', compile an re.Pattern - # object based on the component. - match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None - - # Iterate over directories' children filtering by type and regex. - paths = _select_children(paths, bool(stack), match) - return paths + anchor, parts = pattern._stack + if anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + return self._glob_selector(parts, case_sensitive, recurse_symlinks)(self) def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): """Recursively yield all existing files (of any kind, including diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 336115cf0fead2..ff9ac381a8002d 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1429,10 +1429,10 @@ def __repr__(self): return "{}({!r})".format(self.__class__.__name__, self.as_posix()) def stat(self, *, follow_symlinks=True): - if follow_symlinks: - path = str(self.resolve()) + if follow_symlinks or self.name == '' or self.name == '.' or self.name == '..': + path = str(self.resolve(strict=True)) else: - path = str(self.parent.resolve() / self.name) + path = str(self.parent.resolve(strict=True) / self.name) if path in self._files: st_mode = stat.S_IFREG elif path in self._directories: From 123a0f6b05d4067aa537bf34a46b2e466d5e6089 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 05:57:51 +0100 Subject: [PATCH 32/59] Use literal selector only if no case sensitivity preference is given. --- Lib/glob.py | 6 ++++-- Lib/pathlib/_abc.py | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 6b02815cb1b403..bccac610cd2d3f 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -175,6 +175,8 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): def _compile_pattern(include_hidden, recursive, case_sensitive, sep, pattern): """Compile an re.Pattern object for the given glob-style pattern. """ + if case_sensitive is None: + case_sensitive = _case_sensitive flags = re.NOFLAG if case_sensitive else re.IGNORECASE regex = translate(pattern, include_hidden=include_hidden, recursive=recursive, seps=sep) @@ -236,7 +238,7 @@ def _relative_glob(select, dirname, dir_fd=None): class _Globber: def __init__(self, include_hidden=False, recursive=False, - case_sensitive=_case_sensitive, sep=os.path.sep): + case_sensitive=None, sep=os.path.sep): self.include_hidden = include_hidden self.recursive = recursive self.case_sensitive = case_sensitive @@ -258,7 +260,7 @@ def selector(self, parts): return self.select_exists elif self.recursive and parts[-1] == '**': selector = self.recursive_selector - elif self.case_sensitive != _case_sensitive: + elif self.case_sensitive is not None: selector = self.wildcard_selector elif magic_check.search(parts[-1]) is not None: selector = self.wildcard_selector diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 62ae0c557b6001..f8566c3111168a 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -690,8 +690,6 @@ def _make_child_relpath(self, name): return self.joinpath(name) def _glob_selector(self, parts, case_sensitive, recurse_symlinks): - if case_sensitive is None: - case_sensitive = _is_case_sensitive(self.parser) globber = self._globber( include_hidden=True, recursive=True if recurse_symlinks else glob._disable_recurse_symlinks, From 0ed7b9c1c67dca1e23ceb9d623383ec0bb69cbed Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 06:30:51 +0100 Subject: [PATCH 33/59] Fix a few tests --- Lib/test/test_pathlib/test_pathlib_abc.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index ff9ac381a8002d..716255c8bda2c6 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1741,8 +1741,9 @@ def _check(glob, expected): def test_glob_posix(self): P = self.cls p = P(self.base) + q = p / "FILEa" given = set(p.glob("FILEa")) - expect = set() + expect = set(q) if q.exists() else set() self.assertEqual(given, expect) self.assertEqual(set(p.glob("FILEa*")), set()) @@ -1753,8 +1754,6 @@ def test_glob_windows(self): self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) - self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"}) - self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"}) def test_glob_empty_pattern(self): P = self.cls @@ -1857,8 +1856,9 @@ def _check(path, glob, expected): def test_rglob_posix(self): P = self.cls p = P(self.base, "dirC") + q = p / "FILEd" given = set(p.rglob("FILEd")) - expect = set() + expect = set(q) if q.exists() else set() self.assertEqual(given, expect) self.assertEqual(set(p.rglob("FILEd*")), set()) @@ -1868,7 +1868,6 @@ def test_rglob_windows(self): p = P(self.base, "dirC") self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) - self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"}) @needs_symlinks def test_rglob_recurse_symlinks_common(self): @@ -1931,7 +1930,6 @@ def test_glob_dotdot(self): self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") }) self.assertEqual(set(p.glob("dirA/../file*/..")), set()) self.assertEqual(set(p.glob("../xyzzy")), set()) - self.assertEqual(set(p.glob("xyzzy/..")), set()) self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)}) @needs_symlinks From aceb85fca7dfcc51aa713bd86804c922f5f6531a Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 06:42:22 +0100 Subject: [PATCH 34/59] Fix a few more tests. --- Lib/test/test_pathlib/test_pathlib.py | 35 +++++++++++++++++++++++ Lib/test/test_pathlib/test_pathlib_abc.py | 35 ----------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 651d66656cbd61..2876f02a1e9526 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -1203,6 +1203,41 @@ def test_walk_above_recursion_limit(self): list(base.walk()) list(base.walk(top_down=False)) + @needs_posix + def test_glob_posix(self): + P = self.cls + p = P(self.base) + q = p / "FILEa" + given = set(p.glob("FILEa")) + expect = {q} if q.exists() else set() + self.assertEqual(given, expect) + self.assertEqual(set(p.glob("FILEa*")), set()) + + @needs_posix + def test_rglob_posix(self): + P = self.cls + p = P(self.base, "dirC") + q = p / "dirD" / "FILEd" + given = set(p.rglob("FILEd")) + expect = {q} if q.exists() else set() + self.assertEqual(given, expect) + self.assertEqual(set(p.rglob("FILEd*")), set()) + + @needs_windows + def test_glob_windows(self): + P = self.cls + p = P(self.base) + self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) + self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) + self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) + + @needs_windows + def test_rglob_windows(self): + P = self.cls + p = P(self.base, "dirC") + self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) + self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) + def test_glob_empty_pattern(self): p = self.cls('') with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'): diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 716255c8bda2c6..87cdce151229e8 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1737,24 +1737,6 @@ def _check(glob, expected): else: _check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"]) - @needs_posix - def test_glob_posix(self): - P = self.cls - p = P(self.base) - q = p / "FILEa" - given = set(p.glob("FILEa")) - expect = set(q) if q.exists() else set() - self.assertEqual(given, expect) - self.assertEqual(set(p.glob("FILEa*")), set()) - - @needs_windows - def test_glob_windows(self): - P = self.cls - p = P(self.base) - self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) - self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) - self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) - def test_glob_empty_pattern(self): P = self.cls p = P(self.base) @@ -1852,23 +1834,6 @@ def _check(path, glob, expected): _check(p, "*.txt", ["dirC/novel.txt"]) _check(p, "*.*", ["dirC/novel.txt"]) - @needs_posix - def test_rglob_posix(self): - P = self.cls - p = P(self.base, "dirC") - q = p / "FILEd" - given = set(p.rglob("FILEd")) - expect = set(q) if q.exists() else set() - self.assertEqual(given, expect) - self.assertEqual(set(p.rglob("FILEd*")), set()) - - @needs_windows - def test_rglob_windows(self): - P = self.cls - p = P(self.base, "dirC") - self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) - self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) - @needs_symlinks def test_rglob_recurse_symlinks_common(self): def _check(path, glob, expected): From 8a15db0ed82ce5d53c79bcbdd600495b0be1f2ca Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 11 Apr 2024 00:37:29 +0100 Subject: [PATCH 35/59] Fix select() argument order. --- Lib/glob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index da993a2758b1b6..729c347cd38306 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -56,7 +56,7 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. exists = not os.path.splitdrive(anchor)[0] - paths = select(anchor, anchor, dir_fd, exists) + paths = select(anchor, dir_fd, anchor, exists) else: # Relative pattern. if root_dir is None: @@ -104,7 +104,7 @@ def _relative_glob(select, dirname, dir_fd=None): """ dirname = _Globber.add_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) - return map(slicer, select(dirname, dirname, dir_fd)) + return map(slicer, select(dirname, dir_fd, dirname)) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') From 2018027819ac8540678ea488de69e48878e4607b Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 3 May 2024 21:47:24 +0100 Subject: [PATCH 36/59] Support `include_hidden` and `dir_fd` in `pathlib._glob`. --- Lib/pathlib/_abc.py | 8 ++-- Lib/pathlib/_glob.py | 97 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 80 insertions(+), 25 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 591df443be093a..930f09f41263cf 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -403,7 +403,7 @@ def match(self, path_pattern, *, case_sensitive=None): return False if len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False - globber = self._globber(sep, case_sensitive) + globber = self._globber(sep, case_sensitive, include_hidden=True) for path_part, pattern_part in zip(path_parts, pattern_parts): match = globber.compile(pattern_part) if match(path_part) is None: @@ -419,7 +419,8 @@ def full_match(self, pattern, *, case_sensitive=None): pattern = self.with_segments(pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) - globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True) + globber = self._globber(pattern.parser.sep, case_sensitive, + recursive=True, include_hidden=True) match = globber.compile(pattern._pattern_str) return match(self._pattern_str) is not None @@ -694,7 +695,8 @@ def _glob_selector(self, parts, case_sensitive, recurse_symlinks): # must use scandir() for everything, including non-wildcard parts. case_pedantic = True recursive = True if recurse_symlinks else _glob.no_recurse_symlinks - globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive) + globber = self._globber(self.parser.sep, case_sensitive, + case_pedantic, recursive, include_hidden=True) return globber.selector(parts) def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): diff --git a/Lib/pathlib/_glob.py b/Lib/pathlib/_glob.py index 73ccc0677920ce..6ecd674c87cb48 100644 --- a/Lib/pathlib/_glob.py +++ b/Lib/pathlib/_glob.py @@ -9,6 +9,7 @@ magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') no_recurse_symlinks = object() +open_dir_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) def translate(pat, *, recursive=False, include_hidden=False, seps=None): @@ -66,24 +67,42 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\Z' -@functools.lru_cache(maxsize=512) -def compile_pattern(pat, sep, case_sensitive, recursive=True): +functools.lru_cache(maxsize=1024) +def compile_pattern(pat, sep, case_sensitive, recursive, include_hidden): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep) + regex = translate(pat, recursive=recursive, + include_hidden=include_hidden, seps=sep) return re.compile(regex, flags=flags).match +def open_dir(path, dir_fd=None, rel_path=None): + """Prepares the directory for scanning. Returns a 3-tuple with parts: + 1. A path or fd to supply to `os.scandir()`. + 2. The file descriptor for the directory, or None. + 3. Whether the caller should close the fd (bool). + """ + if dir_fd is None: + return path, None, False + elif rel_path == './': + return dir_fd, dir_fd, False + else: + fd = os.open(rel_path, open_dir_flags, dir_fd=dir_fd) + return fd, fd, True + + class Globber: """Class providing shell-style pattern matching and globbing. """ - def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False): + def __init__(self, sep=os.path.sep, case_sensitive=os.name != 'nt', + case_pedantic=False, recursive=False, include_hidden=False): self.sep = sep self.case_sensitive = case_sensitive self.case_pedantic = case_pedantic self.recursive = recursive + self.include_hidden = include_hidden # Low-level methods @@ -109,7 +128,8 @@ def add_slash(pathname): # High-level methods def compile(self, pat): - return compile_pattern(pat, self.sep, self.case_sensitive, self.recursive) + return compile_pattern(pat, self.sep, self.case_sensitive, + self.recursive, self.include_hidden) def selector(self, parts): """Returns a function that selects from a given path, walking and @@ -133,9 +153,11 @@ def special_selector(self, part, parts): """ select_next = self.selector(parts) - def select_special(path, exists=False): + def select_special(path, dir_fd=None, rel_path=None, exists=False): path = self.concat_path(self.add_slash(path), part) - return select_next(path, exists) + if dir_fd is not None: + rel_path = self.concat_path(self.add_slash(rel_path), part) + return select_next(path, dir_fd, rel_path, exists) return select_special def literal_selector(self, part, parts): @@ -150,9 +172,11 @@ def literal_selector(self, part, parts): select_next = self.selector(parts) - def select_literal(path, exists=False): + def select_literal(path, dir_fd=None, rel_path=None, exists=False): path = self.concat_path(self.add_slash(path), part) - return select_next(path, exists=False) + if dir_fd is not None: + rel_path = self.concat_path(self.add_slash(rel_path), part) + return select_next(path, dir_fd, rel_path, exists=False) return select_literal def wildcard_selector(self, part, parts): @@ -160,16 +184,20 @@ def wildcard_selector(self, part, parts): filtering by pattern. """ - match = None if part == '*' else self.compile(part) + match = None if self.include_hidden and part == '*' else self.compile(part) dir_only = bool(parts) if dir_only: select_next = self.selector(parts) - def select_wildcard(path, exists=False): + def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): + close_fd = False try: + arg, fd, close_fd = open_dir(path, dir_fd, rel_path) + if fd is not None: + prefix = self.add_slash(path) # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(path) as scandir_it: + with self.scandir(arg) as scandir_it: entries = list(scandir_it) except OSError: pass @@ -183,10 +211,16 @@ def select_wildcard(path, exists=False): except OSError: continue entry_path = self.parse_entry(entry) + if fd is not None: + entry_path = self.concat_path(prefix, entry_path) if dir_only: - yield from select_next(entry_path, exists=True) + yield from select_next( + entry_path, fd, entry.name, exists=True) else: yield entry_path + finally: + if close_fd: + os.close(fd) return select_wildcard def recursive_selector(self, part, parts): @@ -208,25 +242,35 @@ def recursive_selector(self, part, parts): while parts and parts[-1] not in special_parts: part += self.sep + parts.pop() - match = None if part == '**' else self.compile(part) + match = None if self.include_hidden and part == '**' else self.compile(part) dir_only = bool(parts) select_next = self.selector(parts) - def select_recursive(path, exists=False): + def select_recursive(path, dir_fd=None, rel_path=None, exists=False): path = self.add_slash(path) + if dir_fd is not None: + rel_path = self.add_slash(rel_path) match_pos = len(str(path)) if match is None or match(str(path), match_pos): - yield from select_next(path, exists) - stack = [path] + yield from select_next(path, dir_fd, rel_path, exists) + stack = [(path, dir_fd, rel_path)] while stack: yield from select_recursive_step(stack, match_pos) def select_recursive_step(stack, match_pos): - path = stack.pop() + path, dir_fd, rel_path = stack.pop() try: + if path is None: + os.close(dir_fd) + return + arg, fd, close_fd = open_dir(path, dir_fd, rel_path) + if fd is not None: + prefix = self.add_slash(path) + if close_fd: + stack.append((None, fd, None)) # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(path) as scandir_it: + with self.scandir(arg) as scandir_it: entries = list(scandir_it) except OSError: pass @@ -241,25 +285,34 @@ def select_recursive_step(stack, match_pos): if is_dir or not dir_only: entry_path = self.parse_entry(entry) + if fd is not None: + entry_path = self.concat_path(prefix, entry_path) if match is None or match(str(entry_path), match_pos): if dir_only: - yield from select_next(entry_path, exists=True) + yield from select_next( + entry_path, fd, entry.name, exists=True) else: # Optimization: directly yield the path if this is # last pattern part. yield entry_path if is_dir: - stack.append(entry_path) + stack.append((entry_path, fd, entry.name)) return select_recursive - def select_exists(self, path, exists=False): + def select_exists(self, path, dir_fd=None, rel_path=None, exists=False): """Yields the given path, if it exists. """ if exists: # Optimization: this path is already known to exist, e.g. because # it was returned from os.scandir(), so we skip calling lstat(). yield path + elif dir_fd is not None: + try: + os.lstat(rel_path, dir_fd=dir_fd) + yield path + except OSError: + pass else: try: self.lstat(path) From 2f216260c29112dd4c48f0aee4fa640c0dec998b Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 3 May 2024 21:50:45 +0100 Subject: [PATCH 37/59] Fix stray newline --- Lib/glob.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index 96a235c13a02cb..9d637f055aa067 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -112,7 +112,6 @@ def has_magic(s): match = magic_check.search(s) return match is not None - def escape(pathname): """Escape all special characters. """ From 339df689b8354ebb863401df8a7e6568f176024e Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sat, 4 May 2024 15:11:16 +0100 Subject: [PATCH 38/59] Update Lib/pathlib/_glob.py Co-authored-by: Pieter Eendebak --- Lib/pathlib/_glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib/_glob.py b/Lib/pathlib/_glob.py index 6ecd674c87cb48..5b8214b2e49887 100644 --- a/Lib/pathlib/_glob.py +++ b/Lib/pathlib/_glob.py @@ -67,7 +67,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\Z' -functools.lru_cache(maxsize=1024) +@functools.lru_cache(maxsize=1024) def compile_pattern(pat, sep, case_sensitive, recursive, include_hidden): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" From 28aa95f2d3071305b48473e740aee5bcc180650b Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 4 May 2024 21:23:42 +0100 Subject: [PATCH 39/59] Fix docs --- Doc/library/glob.rst | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index ab6da98bc74ad2..777a1441c79195 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -75,10 +75,6 @@ The :mod:`glob` module defines the following functions: Using the "``**``" pattern in large directory trees may consume an inordinate amount of time. - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -88,6 +84,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are returned only once. In previous versions, this + function may return duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ include_hidden=False) @@ -98,10 +99,6 @@ The :mod:`glob` module defines the following functions: .. audit-event:: glob.glob pathname,recursive glob.iglob .. audit-event:: glob.glob/2 pathname,recursive,root_dir,dir_fd glob.iglob - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -111,6 +108,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are yielded only once. In previous versions, this + function may yield duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + .. function:: escape(pathname) From abcb1f86689715f4b72e4c85f62b0bf9ef653137 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 4 May 2024 21:51:44 +0100 Subject: [PATCH 40/59] Test for unique results --- Lib/test/test_glob.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 70ee35ed2850bc..bea368699c5ec2 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -317,8 +317,12 @@ def test_recursive_glob(self): with change_cwd(self.tempdir): join = os.path.join eq(glob.glob('**', recursive=True), [join(*i) for i in full]) + eq(glob.glob(join('**', '**'), recursive=True), + [join(*i) for i in full]) eq(glob.glob(join('**', ''), recursive=True), [join(*i) for i in dirs]) + eq(glob.glob(join('**', '**', ''), recursive=True), + [join(*i) for i in dirs]) eq(glob.glob(join('**', '*'), recursive=True), [join(*i) for i in full]) eq(glob.glob(join(os.curdir, '**'), recursive=True), From 71387a6a8b7ea853fcdf650d1290d76dc3c9ba3e Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 4 May 2024 21:59:15 +0100 Subject: [PATCH 41/59] Spacing --- Lib/test/test_glob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index bea368699c5ec2..04ce12e788fac9 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -318,11 +318,11 @@ def test_recursive_glob(self): join = os.path.join eq(glob.glob('**', recursive=True), [join(*i) for i in full]) eq(glob.glob(join('**', '**'), recursive=True), - [join(*i) for i in full]) + [join(*i) for i in full]) eq(glob.glob(join('**', ''), recursive=True), [join(*i) for i in dirs]) eq(glob.glob(join('**', '**', ''), recursive=True), - [join(*i) for i in dirs]) + [join(*i) for i in dirs]) eq(glob.glob(join('**', '*'), recursive=True), [join(*i) for i in full]) eq(glob.glob(join(os.curdir, '**'), recursive=True), From cf119224528509e03e367ac4d25b51a2477f3b9b Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 8 May 2024 18:14:39 +0100 Subject: [PATCH 42/59] Update whatsnew --- Doc/whatsnew/3.14.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 14628f666dd079..14572bce4e2efd 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -90,7 +90,9 @@ Improved Modules Optimizations ============= - +* :func:`glob.glob` and :func:`~glob.iglob` no longer make unnecessary system + calls. This speeds up most globbing operations by 20-80%. + (Contributed by Barney Gale in :gh:`116380`.) Deprecated From 14ae438a1b149c3af1acc3b906f8faab6a1467a2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 31 May 2024 21:50:22 +0100 Subject: [PATCH 43/59] Close file descriptors when `recursive_selector` is finalized. --- Lib/glob.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 16e077dc03b34f..219d2ee6547785 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -381,8 +381,15 @@ def select_recursive(path, dir_fd=None, rel_path=None, exists=False): if match is None or match(str(path), match_pos): yield from select_next(path, dir_fd, rel_path, exists) stack = [(path, dir_fd, rel_path)] - while stack: - yield from select_recursive_step(stack, match_pos) + try: + while stack: + yield from select_recursive_step(stack, match_pos) + finally: + # Close any file descriptors still on the stack. + while stack: + path, dir_fd, rel_path = stack.pop() + if path is None: + os.close(dir_fd) def select_recursive_step(stack, match_pos): path, dir_fd, rel_path = stack.pop() From 69d7a86e759882509bfa685232833bd44afff5b9 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 31 May 2024 22:16:36 +0100 Subject: [PATCH 44/59] Make `iglob()` a generator. --- Lib/glob.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 219d2ee6547785..4a98ae985ded62 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -44,13 +44,17 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) pathname = os.fspath(pathname) - is_bytes = isinstance(pathname, bytes) - if is_bytes: + if isinstance(pathname, bytes): pathname = os.fsdecode(pathname) if root_dir is not None: root_dir = os.fsdecode(root_dir) - anchor, parts = _split_pathname(pathname) + for path in _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): + yield os.fsencode(path) + else: + yield from _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) +def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): + anchor, parts = _split_pathname(pathname) globber = _StringGlobber(recursive=recursive, include_hidden=include_hidden) select = globber.selector(parts) if anchor: @@ -67,8 +71,6 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, # Ensure that the empty string is not yielded when given a pattern # like '' or '**'. paths = itertools.dropwhile(operator.not_, paths) - if is_bytes: - paths = map(os.fsencode, paths) return paths _deprecated_function_message = ( From 3b84a1d6d5892e1e518c11a30ef87efc2b55626b Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 31 May 2024 22:22:40 +0100 Subject: [PATCH 45/59] Make `_iglob()` a generator. --- Lib/glob.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 4a98ae985ded62..11ce93df879d0b 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -4,7 +4,6 @@ import re import fnmatch import functools -import itertools import operator import sys @@ -61,17 +60,19 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. exists = not os.path.splitdrive(anchor)[0] - paths = select(anchor, dir_fd, anchor, exists) + yield from select(anchor, dir_fd, anchor, exists) else: # Relative pattern. if root_dir is None: root_dir = os.path.curdir paths = _relative_glob(select, root_dir, dir_fd) - - # Ensure that the empty string is not yielded when given a pattern - # like '' or '**'. - paths = itertools.dropwhile(operator.not_, paths) - return paths + try: + path = next(paths) # skip empty string + if path: + yield path + yield from paths + except StopIteration: + pass _deprecated_function_message = ( "{name} is deprecated and will be removed in Python {remove}. Use " From f9f9a8de53d62090cae9adf53edb4092e527414f Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 31 May 2024 22:22:57 +0100 Subject: [PATCH 46/59] Make `_relative_glob()` a generator. --- Lib/glob.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index 11ce93df879d0b..b5f831c8f56386 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -108,7 +108,8 @@ def _relative_glob(select, dirname, dir_fd=None): """ dirname = _StringGlobber.add_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) - return map(slicer, select(dirname, dir_fd, dirname)) + for path in select(dirname, dir_fd, dirname): + yield slicer(path) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') From 24a9ee482cf7df8e25f3405cf93def09858d234e Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 31 May 2024 22:25:29 +0100 Subject: [PATCH 47/59] Simplify skipping empty string --- Lib/glob.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index b5f831c8f56386..44c50cfa8d2d6a 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -60,19 +60,18 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. exists = not os.path.splitdrive(anchor)[0] - yield from select(anchor, dir_fd, anchor, exists) + paths = select(anchor, dir_fd, anchor, exists) else: # Relative pattern. if root_dir is None: root_dir = os.path.curdir paths = _relative_glob(select, root_dir, dir_fd) - try: - path = next(paths) # skip empty string + # Skip empty string. + for path in paths: if path: yield path - yield from paths - except StopIteration: - pass + break + yield from paths _deprecated_function_message = ( "{name} is deprecated and will be removed in Python {remove}. Use " From a94f2a7b4bf38aeaa0cd22d16c946b937aa20b64 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 7 Jun 2024 18:31:04 +0100 Subject: [PATCH 48/59] Make `_GlobberBase` fully abstract. --- Lib/glob.py | 78 +++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index e8b224408a43cd..e9c50e40082f72 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -203,21 +203,6 @@ def _compile_pattern(pat, sep, case_sensitive, recursive, include_hidden): return re.compile(regex, flags=flags).match -def _open_dir(path, dir_fd=None, rel_path=None): - """Prepares the directory for scanning. Returns a 3-tuple with parts: - 1. A path or fd to supply to `os.scandir()`. - 2. The file descriptor for the directory, or None. - 3. Whether the caller should close the fd (bool). - """ - if dir_fd is None: - return path, None, False - elif rel_path == './': - return dir_fd, dir_fd, False - else: - fd = os.open(rel_path, _dir_open_flags, dir_fd=dir_fd) - return fd, fd, True - - class _GlobberBase: """Abstract class providing shell-style pattern matching and globbing. """ @@ -238,12 +223,30 @@ def lexists(path): """ raise NotImplementedError + @staticmethod + def lstat(path, dir_fd=None): + """Implements os.lstat() + """ + raise NotImplementedError + + @staticmethod + def opendir(path, flags, dir_fd=None): + """Implements os.open() + """ + raise NotImplementedError + @staticmethod def scandir(path): """Implements os.scandir(). """ raise NotImplementedError + @staticmethod + def closedir(fd): + """Implements os.close(). + """ + raise NotImplementedError + @staticmethod def add_slash(path): """Returns a path with a trailing slash added. @@ -327,15 +330,16 @@ def wildcard_selector(self, part, parts): select_next = self.selector(parts) def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): - close_fd = False + fd = None try: - arg, fd, close_fd = _open_dir(path, dir_fd, rel_path) - if fd is not None: + if dir_fd is None: + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + else: + fd = self.opendir(rel_path, _dir_open_flags, dir_fd=dir_fd) + with self.scandir(fd) as scandir_it: + entries = list(scandir_it) prefix = self.add_slash(path) - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(arg) as scandir_it: - entries = list(scandir_it) except OSError: pass else: @@ -356,8 +360,8 @@ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): else: yield entry_path finally: - if close_fd: - os.close(fd) + if fd is not None: + self.closedir(fd) return select_wildcard def recursive_selector(self, part, parts): @@ -399,23 +403,24 @@ def select_recursive(path, dir_fd=None, rel_path=None, exists=False): while stack: path, dir_fd, rel_path = stack.pop() if path is None: - os.close(dir_fd) + self.closedir(dir_fd) def select_recursive_step(stack, match_pos): path, dir_fd, rel_path = stack.pop() try: if path is None: - os.close(dir_fd) + self.closedir(dir_fd) return - arg, fd, close_fd = _open_dir(path, dir_fd, rel_path) - if fd is not None: + elif dir_fd is None: + fd = None + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + else: + fd = self.opendir(rel_path, _dir_open_flags, dir_fd=dir_fd) + stack.append((None, fd, None)) + with self.scandir(fd) as scandir_it: + entries = list(scandir_it) prefix = self.add_slash(path) - if close_fd: - stack.append((None, fd, None)) - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(arg) as scandir_it: - entries = list(scandir_it) except OSError: pass else: @@ -453,7 +458,7 @@ def select_exists(self, path, dir_fd=None, rel_path=None, exists=False): yield path elif dir_fd is not None: try: - os.lstat(rel_path, dir_fd=dir_fd) + self.lstat(rel_path, dir_fd=dir_fd) yield path except OSError: pass @@ -465,7 +470,10 @@ class _StringGlobber(_GlobberBase): """Provides shell-style pattern matching and globbing for string paths. """ lexists = staticmethod(os.path.lexists) + lstat = staticmethod(os.lstat) + opendir = staticmethod(os.open) scandir = staticmethod(os.scandir) + closedir = staticmethod(os.close) parse_entry = operator.attrgetter('path') concat_path = operator.add From d19bb893415ad8e87ecffb69f99013fa71725262 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Jun 2024 21:12:23 +0100 Subject: [PATCH 49/59] Address review feedback --- Lib/glob.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index e9c50e40082f72..aa5b6972e9932d 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -59,7 +59,7 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. - exists = not os.path.splitdrive(anchor)[0] + exists = not os.path.root(anchor)[0] paths = select(anchor, dir_fd, anchor, exists) else: # Relative pattern. @@ -91,6 +91,10 @@ def glob1(dirname, pattern): def _split_pathname(pathname): """Split the given path into a pair (anchor, parts), where *anchor* is the path drive and root (if any), and *parts* is a reversed list of path parts. + For example: + + _split_pathname('C:\\a\\b') == ('C:\\', ['b', 'a']) + _split_pathname('/usr/bin') == ('/', ['bin', 'usr']) """ parts = [] split = os.path.split @@ -102,7 +106,7 @@ def _split_pathname(pathname): return dirname, parts def _relative_glob(select, dirname, dir_fd=None): - """Globs using a select function from the given dirname. The dirname + """Globs using a *select* function from the given dirname. The dirname prefix is removed from results. """ dirname = _StringGlobber.add_slash(dirname) @@ -230,7 +234,7 @@ def lstat(path, dir_fd=None): raise NotImplementedError @staticmethod - def opendir(path, flags, dir_fd=None): + def open(path, flags, dir_fd=None): """Implements os.open() """ raise NotImplementedError @@ -242,7 +246,7 @@ def scandir(path): raise NotImplementedError @staticmethod - def closedir(fd): + def close(fd): """Implements os.close(). """ raise NotImplementedError @@ -336,7 +340,7 @@ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): with self.scandir(path) as scandir_it: entries = list(scandir_it) else: - fd = self.opendir(rel_path, _dir_open_flags, dir_fd=dir_fd) + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) with self.scandir(fd) as scandir_it: entries = list(scandir_it) prefix = self.add_slash(path) @@ -361,7 +365,7 @@ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): yield entry_path finally: if fd is not None: - self.closedir(fd) + self.close(fd) return select_wildcard def recursive_selector(self, part, parts): @@ -403,20 +407,20 @@ def select_recursive(path, dir_fd=None, rel_path=None, exists=False): while stack: path, dir_fd, rel_path = stack.pop() if path is None: - self.closedir(dir_fd) + self.close(dir_fd) def select_recursive_step(stack, match_pos): path, dir_fd, rel_path = stack.pop() try: if path is None: - self.closedir(dir_fd) + self.close(dir_fd) return elif dir_fd is None: fd = None with self.scandir(path) as scandir_it: entries = list(scandir_it) else: - fd = self.opendir(rel_path, _dir_open_flags, dir_fd=dir_fd) + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) stack.append((None, fd, None)) with self.scandir(fd) as scandir_it: entries = list(scandir_it) @@ -471,9 +475,9 @@ class _StringGlobber(_GlobberBase): """ lexists = staticmethod(os.path.lexists) lstat = staticmethod(os.lstat) - opendir = staticmethod(os.open) + open = staticmethod(os.open) scandir = staticmethod(os.scandir) - closedir = staticmethod(os.close) + close = staticmethod(os.close) parse_entry = operator.attrgetter('path') concat_path = operator.add From 1677588d6cca1b9af57548f318f8b58c8ea250c7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Jun 2024 21:52:11 +0100 Subject: [PATCH 50/59] Typo fix --- Lib/glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/glob.py b/Lib/glob.py index aa5b6972e9932d..8076761f5ef2ce 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -59,7 +59,7 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. - exists = not os.path.root(anchor)[0] + exists = not os.path.splitroot(anchor)[0] paths = select(anchor, dir_fd, anchor, exists) else: # Relative pattern. From 539f044976a9f03a38446017dcc1cf68cd06762d Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Jun 2024 22:28:49 +0100 Subject: [PATCH 51/59] Speed up pattern parsing. --- Lib/glob.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 8076761f5ef2ce..f9e2a04a6a1ece 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -53,14 +53,17 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, yield from _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): - anchor, parts = _split_pathname(pathname) + if os.name == 'nt': + pathname = pathname.replace('/', '\\') + drive, root, tail = os.path.splitroot(pathname) + anchor = drive + root + parts = tail.split(os.path.sep)[::-1] if tail else [] globber = _StringGlobber(recursive=recursive, include_hidden=include_hidden) select = globber.selector(parts) if anchor: # Non-relative pattern. The anchor is guaranteed to exist unless it # has a Windows drive component. - exists = not os.path.splitroot(anchor)[0] - paths = select(anchor, dir_fd, anchor, exists) + paths = select(anchor, dir_fd, anchor, not drive) else: # Relative pattern. if root_dir is None: @@ -88,23 +91,6 @@ def glob1(dirname, pattern): warnings._deprecated("glob.glob1", _deprecated_function_message, remove=(3, 15)) return list(_relative_glob(_StringGlobber().wildcard_selector(pattern, []), dirname)) -def _split_pathname(pathname): - """Split the given path into a pair (anchor, parts), where *anchor* is the - path drive and root (if any), and *parts* is a reversed list of path parts. - For example: - - _split_pathname('C:\\a\\b') == ('C:\\', ['b', 'a']) - _split_pathname('/usr/bin') == ('/', ['bin', 'usr']) - """ - parts = [] - split = os.path.split - dirname, part = split(pathname) - while dirname != pathname: - parts.append(part) - pathname = dirname - dirname, part = split(pathname) - return dirname, parts - def _relative_glob(select, dirname, dir_fd=None): """Globs using a *select* function from the given dirname. The dirname prefix is removed from results. From 70a1b42dae159cf80273667c881620bf1115c1e9 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 12 Jun 2024 23:23:39 +0100 Subject: [PATCH 52/59] Add test for globbing above recursion limit. --- Lib/test/test_glob.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 53afbf2216c8b4..5118960e2e76f1 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -6,7 +6,7 @@ import unittest import warnings -from test.support import is_wasi, Py_DEBUG +from test.support import is_wasi, Py_DEBUG, infinite_recursion from test.support.os_helper import (TESTFN, skip_unless_symlink, can_symlink, create_empty_file, change_cwd) @@ -390,6 +390,15 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_glob_above_recursion_limit(self): + depth = 30 + base = os.path.join(self.tempdir, 'deep') + p = os.path.join(base, *(['d']*depth)) + os.makedirs(p) + pattern = os.path.join(base, '**', 'd') + with infinite_recursion(depth - 5): + glob.glob(pattern, recursive=True) + def test_glob0(self): with self.assertWarns(DeprecationWarning): glob.glob0(self.tempdir, 'a') From 099e86e58252e95d5b72c86bfb566daed64ad318 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sun, 1 Sep 2024 15:52:48 +0100 Subject: [PATCH 53/59] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/whatsnew/3.14.rst | 4 ++-- .../Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index bce6d3f5b12654..0a105bb65658b2 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -242,8 +242,8 @@ asyncio glob ---- -* :func:`glob.glob` and :func:`~glob.iglob` no longer make some unnecessary - system calls. This speeds up most globbing operations by 20-80%. +* Reduce the number of system calls in :func:`glob.glob` and :func:`~glob.iglob`, + thereby improving the speed of globbing operations by 20-80%. (Contributed by Barney Gale in :gh:`116380`.) diff --git a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst index db235d153c8666..b7f27ab7191a96 100644 --- a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst +++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst @@ -1,2 +1,2 @@ Speed up :func:`glob.glob` and :func:`glob.iglob` by making use of -:func:`glob.translate` and tracking path existence in more detail. +:func:`glob.translate` and tracking path existence more precisely. From ee76fafa5af633c691dd057c0f9e19b451bc65b8 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 1 Sep 2024 16:37:42 +0100 Subject: [PATCH 54/59] Test that `iglob().close()` closes file descriptors. --- Lib/test/test_glob.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 5118960e2e76f1..5bda7b5f822415 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -4,6 +4,7 @@ import shutil import sys import unittest +import unittest.mock import warnings from test.support import is_wasi, Py_DEBUG, infinite_recursion @@ -11,6 +12,9 @@ can_symlink, create_empty_file, change_cwd) +_supports_dir_fd = {os.open, os.stat} <= os.supports_dir_fd and os.scandir in os.supports_fd + + class GlobTests(unittest.TestCase): dir_fd = None @@ -48,7 +52,7 @@ def setUp(self): def open_dirfd(self): if self.dir_fd is not None: os.close(self.dir_fd) - if {os.open, os.stat} <= os.supports_dir_fd and os.scandir in os.supports_fd: + if _supports_dir_fd: self.dir_fd = os.open(self.tempdir, os.O_RDONLY | os.O_DIRECTORY) else: self.dir_fd = None @@ -399,6 +403,24 @@ def test_glob_above_recursion_limit(self): with infinite_recursion(depth - 5): glob.glob(pattern, recursive=True) + @unittest.skipUnless(_supports_dir_fd, "Needs support for iglob(dir_fd=...)") + def test_iglob_iter_close(self): + base = os.path.join(self.tempdir, 'deep') + p = os.path.join(base, *(['d'] * 10)) + os.makedirs(p) + with ( + unittest.mock.patch("glob._StringGlobber.open", wraps=os.open) as os_open, + unittest.mock.patch("glob._StringGlobber.close", wraps=os.close) as os_close + ): + self.assertEqual(os_open.call_count, os_close.call_count) + iter = glob.iglob('**/*/d', dir_fd=self.dir_fd, recursive=True) + self.assertEqual(os_open.call_count, os_close.call_count) + self.assertEqual(next(iter), 'deep/d') + self.assertEqual(next(iter), 'deep/d/d') + self.assertGreater(os_open.call_count, os_close.call_count) + iter.close() + self.assertEqual(os_open.call_count, os_close.call_count) + def test_glob0(self): with self.assertWarns(DeprecationWarning): glob.glob0(self.tempdir, 'a') From 4cf8a4d638788f1410e429978c30bbdd8e963a00 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 1 Sep 2024 16:55:33 +0100 Subject: [PATCH 55/59] Address some review feedback --- Lib/glob.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index f9e2a04a6a1ece..06a29909d85148 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -393,7 +393,10 @@ def select_recursive(path, dir_fd=None, rel_path=None, exists=False): while stack: path, dir_fd, rel_path = stack.pop() if path is None: - self.close(dir_fd) + try: + self.close(dir_fd) + except OSError: + pass def select_recursive_step(stack, match_pos): path, dir_fd, rel_path = stack.pop() @@ -440,7 +443,8 @@ def select_recursive_step(stack, match_pos): return select_recursive def select_exists(self, path, dir_fd=None, rel_path=None, exists=False): - """Yields the given path, if it exists. + """Yields the given path, if it exists. If *dir_fd* is given, we check + whether *rel_path* exists relative to the fd. """ if exists: # Optimization: this path is already known to exist, e.g. because From 3ad9367adb73a2b1d8f6f8f20a0d90e6af70e0c4 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 27 Oct 2024 23:18:24 +0000 Subject: [PATCH 56/59] Address more review comments --- Lib/glob.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 06a29909d85148..d030a3d28317a7 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -47,10 +47,10 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, pathname = os.fsdecode(pathname) if root_dir is not None: root_dir = os.fsdecode(root_dir) - for path in _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): - yield os.fsencode(path) + paths = _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + return map(os.fsencode, paths) else: - yield from _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + return _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): if os.name == 'nt': @@ -70,10 +70,9 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): root_dir = os.path.curdir paths = _relative_glob(select, root_dir, dir_fd) # Skip empty string. - for path in paths: - if path: - yield path - break + path = next(paths, None) + if path: + yield path yield from paths _deprecated_function_message = ( @@ -93,12 +92,12 @@ def glob1(dirname, pattern): def _relative_glob(select, dirname, dir_fd=None): """Globs using a *select* function from the given dirname. The dirname - prefix is removed from results. + prefix is removed from results. If dir_fd is supplied, then dirname is + opened relative to the given file descriptor. """ dirname = _StringGlobber.add_slash(dirname) slicer = operator.itemgetter(slice(len(dirname), None)) - for path in select(dirname, dir_fd, dirname): - yield slicer(path) + return map(slicer, select(dirname, dir_fd, dirname)) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -391,7 +390,7 @@ def select_recursive(path, dir_fd=None, rel_path=None, exists=False): finally: # Close any file descriptors still on the stack. while stack: - path, dir_fd, rel_path = stack.pop() + path, dir_fd, _rel_path = stack.pop() if path is None: try: self.close(dir_fd) From 66af33db814eae630d5ba498382105a260839d34 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 27 Oct 2024 23:20:45 +0000 Subject: [PATCH 57/59] Drop parse_entry --- Lib/glob.py | 19 ++++--------------- Lib/pathlib/_abc.py | 5 ----- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index d030a3d28317a7..1f7ee692eb0d16 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -248,12 +248,6 @@ def concat_path(path, text): """ raise NotImplementedError - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir(). - """ - raise NotImplementedError - # High-level methods def compile(self, pat): @@ -328,10 +322,10 @@ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) with self.scandir(fd) as scandir_it: entries = list(scandir_it) - prefix = self.add_slash(path) except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: if match is None or match(entry.name): if dir_only: @@ -340,9 +334,7 @@ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): continue except OSError: continue - entry_path = self.parse_entry(entry) - if fd is not None: - entry_path = self.concat_path(prefix, entry_path) + entry_path = self.concat_path(prefix, entry.name) if dir_only: yield from select_next( entry_path, fd, entry.name, exists=True) @@ -412,10 +404,10 @@ def select_recursive_step(stack, match_pos): stack.append((None, fd, None)) with self.scandir(fd) as scandir_it: entries = list(scandir_it) - prefix = self.add_slash(path) except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: is_dir = False try: @@ -425,9 +417,7 @@ def select_recursive_step(stack, match_pos): pass if is_dir or not dir_only: - entry_path = self.parse_entry(entry) - if fd is not None: - entry_path = self.concat_path(prefix, entry_path) + entry_path = self.concat_path(prefix, entry.name) if match is None or match(str(entry_path), match_pos): if dir_only: yield from select_next( @@ -467,7 +457,6 @@ class _StringGlobber(_GlobberBase): open = staticmethod(os.open) scandir = staticmethod(os.scandir) close = staticmethod(os.close) - parse_entry = operator.attrgetter('path') concat_path = operator.add if os.name == 'nt': diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index e398ca0a864ec1..8621699581d55d 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -108,11 +108,6 @@ def concat_path(path, text): """Appends text to the given path.""" return path.with_segments(path._raw_path + text) - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir().""" - return entry - class PurePathBase: """Base class for pure path objects. From ce74ef17d8e4df1d2845d9293aab8b3157c7c2c5 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 28 Oct 2024 18:56:09 +0000 Subject: [PATCH 58/59] Address review feedback --- Lib/glob.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 1f7ee692eb0d16..9adc0e2b34f9e5 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -53,8 +53,8 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, return _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): - if os.name == 'nt': - pathname = pathname.replace('/', '\\') + if os.path.altsep: + pathname = pathname.replace(os.path.altsep, os.path.sep) drive, root, tail = os.path.splitroot(pathname) anchor = drive + root parts = tail.split(os.path.sep)[::-1] if tail else [] @@ -70,8 +70,7 @@ def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): root_dir = os.path.curdir paths = _relative_glob(select, root_dir, dir_fd) # Skip empty string. - path = next(paths, None) - if path: + if path := next(paths, None): yield path yield from paths From a69a060b134c665dacf6aaaaf9c316a6d016902a Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 28 Oct 2024 18:59:56 +0000 Subject: [PATCH 59/59] Add comment. --- Lib/glob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/glob.py b/Lib/glob.py index 9adc0e2b34f9e5..3d531a5505eacd 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -400,6 +400,7 @@ def select_recursive_step(stack, match_pos): entries = list(scandir_it) else: fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + # Schedule the file descriptor to be closed next step. stack.append((None, fd, None)) with self.scandir(fd) as scandir_it: entries = list(scandir_it)