fix(pypi): change the parallelisation scheme for querying SimpleAPI (#2531)

aignas · dougthor42 · web-flow · commit 475a99e283ac · 2024-12-31T06:16:38.000Z
Instead of querying everything in parallel and yielding a lot of 404 warnings, let's query the main index first and then query the other indexes only for the packages that were not yet found. What is more, we can print the value of `experimental_index_url_overrides` for the users to use. Whilst at it, add a unit test to check the new logic. Fixes #2100, since this is the best `rules_python` can do for now. --------- Co-authored-by: Douglas Thor <dougthor42@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -56,6 +56,10 @@ Unreleased changes template.
 * Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported
   version, per our Bazel support matrix. Earlier versions are not
   tested by CI, so functionality cannot be guaranteed.
+* ({bzl:obj}`pip.parse`) From now we will make fewer calls to indexes when
+  fetching the metadata from SimpleAPI. The calls will be done in parallel to
+  each index separately, so the extension evaluation time might slow down if
+  not using {bzl:obj}`pip.parse.experimental_index_url_overrides`.
 * ({bzl:obj}`pip.parse`) Only query SimpleAPI for packages that have
   sha values in the `requirements.txt` file.
 
diff --git a/python/private/pypi/extension.bzl b/python/private/pypi/extension.bzl
@@ -657,6 +657,11 @@ The indexes must support Simple API as described here:
 https://packaging.python.org/en/latest/specifications/simple-repository-api/
 
 This is equivalent to `--extra-index-urls` `pip` option.
+
+:::{versionchanged} 1.1.0
+Starting with this version we will iterate over each index specified until
+we find metadata for all references distributions.
+:::
 """,
             default = [],
         ),
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
@@ -20,9 +20,17 @@ load("@bazel_features//:features.bzl", "bazel_features")
 load("//python/private:auth.bzl", "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
+load("//python/private:text_util.bzl", "render")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
 
-def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
+def simpleapi_download(
+        ctx,
+        *,
+        attr,
+        cache,
+        parallel_download = True,
+        read_simpleapi = None,
+        _fail = fail):
     """Download Simple API HTML.
 
     Args:
@@ -49,6 +57,9 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
             reflected when re-evaluating the extension unless we do
             `bazel clean --expunge`.
         parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
+        read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
+            Used in tests.
+        _fail: a function to print a failure. Used in tests.
 
     Returns:
         dict of pkg name to the parsed HTML contents - a list of structs.
@@ -64,15 +75,22 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
 
     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
-    async_downloads = {}
     contents = {}
     index_urls = [attr.index_url] + attr.extra_index_urls
-    for pkg in attr.sources:
-        pkg_normalized = normalize_name(pkg)
-
-        success = False
-        for index_url in index_urls:
-            result = _read_simpleapi(
+    read_simpleapi = read_simpleapi or _read_simpleapi
+
+    found_on_index = {}
+    warn_overrides = False
+    for i, index_url in enumerate(index_urls):
+        if i != 0:
+            # Warn the user about a potential fix for the overrides
+            warn_overrides = True
+
+        async_downloads = {}
+        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+        for pkg in sources:
+            pkg_normalized = normalize_name(pkg)
+            result = read_simpleapi(
                 ctx = ctx,
                 url = "{}/{}/".format(
                     index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
@@ -84,42 +102,45 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
             )
             if hasattr(result, "wait"):
                 # We will process it in a separate loop:
-                async_downloads.setdefault(pkg_normalized, []).append(
-                    struct(
-                        pkg_normalized = pkg_normalized,
-                        wait = result.wait,
-                    ),
+                async_downloads[pkg] = struct(
+                    pkg_normalized = pkg_normalized,
+                    wait = result.wait,
                 )
-                continue
-
-            if result.success:
+            elif result.success:
                 contents[pkg_normalized] = result.output
-                success = True
-                break
-
-        if not async_downloads and not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
-
-    if not async_downloads:
-        return contents
-
-    # If we use `block` == False, then we need to have a second loop that is
-    # collecting all of the results as they were being downloaded in parallel.
-    for pkg, downloads in async_downloads.items():
-        success = False
-        for download in downloads:
+                found_on_index[pkg] = index_url
+
+        if not async_downloads:
+            continue
+
+        # If we use `block` == False, then we need to have a second loop that is
+        # collecting all of the results as they were being downloaded in parallel.
+        for pkg, download in async_downloads.items():
             result = download.wait()
 
-            if result.success and download.pkg_normalized not in contents:
+            if result.success:
                 contents[download.pkg_normalized] = result.output
-                success = True
-
-        if not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
+                found_on_index[pkg] = index_url
+
+    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+    if failed_sources:
+        _fail("Failed to download metadata for {} for from urls: {}".format(
+            failed_sources,
+            index_urls,
+        ))
+        return None
+
+    if warn_overrides:
+        index_url_overrides = {
+            pkg: found_on_index[pkg]
+            for pkg in attr.sources
+            if found_on_index[pkg] != attr.index_url
+        }
+
+        # buildifier: disable=print
+        print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
+            render.dict(index_url_overrides),
+        ))
 
     return contents
 
diff --git a/tests/pypi/simpleapi_download/BUILD.bazel b/tests/pypi/simpleapi_download/BUILD.bazel
@@ -0,0 +1,5 @@
+load("simpleapi_download_tests.bzl", "simpleapi_download_test_suite")
+
+simpleapi_download_test_suite(
+    name = "simpleapi_download_tests",
+)
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -0,0 +1,128 @@
+# Copyright 2024 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""
+
+load("@rules_testing//lib:test_suite.bzl", "test_suite")
+load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download")  # buildifier: disable=bzl-visibility
+
+_tests = []
+
+def _test_simple(env):
+    calls = []
+
+    def read_simpleapi(ctx, url, attr, cache, block):
+        _ = ctx  # buildifier: disable=unused-variable
+        _ = attr
+        _ = cache
+        env.expect.that_bool(block).equals(False)
+        calls.append(url)
+        if "foo" in url and "main" in url:
+            return struct(
+                output = "",
+                success = False,
+            )
+        else:
+            return struct(
+                output = "data from {}".format(url),
+                success = True,
+            )
+
+    contents = simpleapi_download(
+        ctx = struct(
+            os = struct(environ = {}),
+        ),
+        attr = struct(
+            index_url_overrides = {},
+            index_url = "main",
+            extra_index_urls = ["extra"],
+            sources = ["foo", "bar", "baz"],
+            envsubst = [],
+        ),
+        cache = {},
+        parallel_download = True,
+        read_simpleapi = read_simpleapi,
+    )
+
+    env.expect.that_collection(calls).contains_exactly([
+        "extra/foo/",
+        "main/bar/",
+        "main/baz/",
+        "main/foo/",
+    ])
+    env.expect.that_dict(contents).contains_exactly({
+        "bar": "data from main/bar/",
+        "baz": "data from main/baz/",
+        "foo": "data from extra/foo/",
+    })
+
+_tests.append(_test_simple)
+
+def _test_fail(env):
+    calls = []
+    fails = []
+
+    def read_simpleapi(ctx, url, attr, cache, block):
+        _ = ctx  # buildifier: disable=unused-variable
+        _ = attr
+        _ = cache
+        env.expect.that_bool(block).equals(False)
+        calls.append(url)
+        if "foo" in url:
+            return struct(
+                output = "",
+                success = False,
+            )
+        else:
+            return struct(
+                output = "data from {}".format(url),
+                success = True,
+            )
+
+    simpleapi_download(
+        ctx = struct(
+            os = struct(environ = {}),
+        ),
+        attr = struct(
+            index_url_overrides = {},
+            index_url = "main",
+            extra_index_urls = ["extra"],
+            sources = ["foo", "bar", "baz"],
+            envsubst = [],
+        ),
+        cache = {},
+        parallel_download = True,
+        read_simpleapi = read_simpleapi,
+        _fail = fails.append,
+    )
+
+    env.expect.that_collection(fails).contains_exactly([
+        """Failed to download metadata for ["foo"] for from urls: ["main", "extra"]""",
+    ])
+    env.expect.that_collection(calls).contains_exactly([
+        "extra/foo/",
+        "main/bar/",
+        "main/baz/",
+        "main/foo/",
+    ])
+
+_tests.append(_test_fail)
+
+def simpleapi_download_test_suite(name):
+    """Create the test suite.
+
+    Args:
+        name: the name of the test suite
+    """
+    test_suite(name = name, basic_tests = _tests)