Skip to content

Commit 475a99e

Browse files
aignasdougthor42
andauthored
fix(pypi): change the parallelisation scheme for querying SimpleAPI (#2531)
Instead of querying everything in parallel and yielding a lot of 404 warnings, let's query the main index first and then query the other indexes only for the packages that were not yet found. What is more, we can print the value of `experimental_index_url_overrides` for the users to use. Whilst at it, add a unit test to check the new logic. Fixes #2100, since this is the best `rules_python` can do for now. --------- Co-authored-by: Douglas Thor <[email protected]>
1 parent 9035db2 commit 475a99e

File tree

5 files changed

+202
-39
lines changed

5 files changed

+202
-39
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ Unreleased changes template.
5656
* Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported
5757
version, per our Bazel support matrix. Earlier versions are not
5858
tested by CI, so functionality cannot be guaranteed.
59+
* ({bzl:obj}`pip.parse`) From now we will make fewer calls to indexes when
60+
fetching the metadata from SimpleAPI. The calls will be done in parallel to
61+
each index separately, so the extension evaluation time might slow down if
62+
not using {bzl:obj}`pip.parse.experimental_index_url_overrides`.
5963
* ({bzl:obj}`pip.parse`) Only query SimpleAPI for packages that have
6064
sha values in the `requirements.txt` file.
6165

python/private/pypi/extension.bzl

+5
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,11 @@ The indexes must support Simple API as described here:
657657
https://packaging.python.org/en/latest/specifications/simple-repository-api/
658658
659659
This is equivalent to `--extra-index-urls` `pip` option.
660+
661+
:::{versionchanged} 1.1.0
662+
Starting with this version we will iterate over each index specified until
663+
we find metadata for all references distributions.
664+
:::
660665
""",
661666
default = [],
662667
),

python/private/pypi/simpleapi_download.bzl

+60-39
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,17 @@ load("@bazel_features//:features.bzl", "bazel_features")
2020
load("//python/private:auth.bzl", "get_auth")
2121
load("//python/private:envsubst.bzl", "envsubst")
2222
load("//python/private:normalize_name.bzl", "normalize_name")
23+
load("//python/private:text_util.bzl", "render")
2324
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
2425

25-
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
26+
def simpleapi_download(
27+
ctx,
28+
*,
29+
attr,
30+
cache,
31+
parallel_download = True,
32+
read_simpleapi = None,
33+
_fail = fail):
2634
"""Download Simple API HTML.
2735
2836
Args:
@@ -49,6 +57,9 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
4957
reflected when re-evaluating the extension unless we do
5058
`bazel clean --expunge`.
5159
parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
60+
read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
61+
Used in tests.
62+
_fail: a function to print a failure. Used in tests.
5263
5364
Returns:
5465
dict of pkg name to the parsed HTML contents - a list of structs.
@@ -64,15 +75,22 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
6475

6576
# NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
6677
# to replicate how `pip` would handle this case.
67-
async_downloads = {}
6878
contents = {}
6979
index_urls = [attr.index_url] + attr.extra_index_urls
70-
for pkg in attr.sources:
71-
pkg_normalized = normalize_name(pkg)
72-
73-
success = False
74-
for index_url in index_urls:
75-
result = _read_simpleapi(
80+
read_simpleapi = read_simpleapi or _read_simpleapi
81+
82+
found_on_index = {}
83+
warn_overrides = False
84+
for i, index_url in enumerate(index_urls):
85+
if i != 0:
86+
# Warn the user about a potential fix for the overrides
87+
warn_overrides = True
88+
89+
async_downloads = {}
90+
sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
91+
for pkg in sources:
92+
pkg_normalized = normalize_name(pkg)
93+
result = read_simpleapi(
7694
ctx = ctx,
7795
url = "{}/{}/".format(
7896
index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
@@ -84,42 +102,45 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
84102
)
85103
if hasattr(result, "wait"):
86104
# We will process it in a separate loop:
87-
async_downloads.setdefault(pkg_normalized, []).append(
88-
struct(
89-
pkg_normalized = pkg_normalized,
90-
wait = result.wait,
91-
),
105+
async_downloads[pkg] = struct(
106+
pkg_normalized = pkg_normalized,
107+
wait = result.wait,
92108
)
93-
continue
94-
95-
if result.success:
109+
elif result.success:
96110
contents[pkg_normalized] = result.output
97-
success = True
98-
break
99-
100-
if not async_downloads and not success:
101-
fail("Failed to download metadata from urls: {}".format(
102-
", ".join(index_urls),
103-
))
104-
105-
if not async_downloads:
106-
return contents
107-
108-
# If we use `block` == False, then we need to have a second loop that is
109-
# collecting all of the results as they were being downloaded in parallel.
110-
for pkg, downloads in async_downloads.items():
111-
success = False
112-
for download in downloads:
111+
found_on_index[pkg] = index_url
112+
113+
if not async_downloads:
114+
continue
115+
116+
# If we use `block` == False, then we need to have a second loop that is
117+
# collecting all of the results as they were being downloaded in parallel.
118+
for pkg, download in async_downloads.items():
113119
result = download.wait()
114120

115-
if result.success and download.pkg_normalized not in contents:
121+
if result.success:
116122
contents[download.pkg_normalized] = result.output
117-
success = True
118-
119-
if not success:
120-
fail("Failed to download metadata from urls: {}".format(
121-
", ".join(index_urls),
122-
))
123+
found_on_index[pkg] = index_url
124+
125+
failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
126+
if failed_sources:
127+
_fail("Failed to download metadata for {} for from urls: {}".format(
128+
failed_sources,
129+
index_urls,
130+
))
131+
return None
132+
133+
if warn_overrides:
134+
index_url_overrides = {
135+
pkg: found_on_index[pkg]
136+
for pkg in attr.sources
137+
if found_on_index[pkg] != attr.index_url
138+
}
139+
140+
# buildifier: disable=print
141+
print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
142+
render.dict(index_url_overrides),
143+
))
123144

124145
return contents
125146

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
load("simpleapi_download_tests.bzl", "simpleapi_download_test_suite")
2+
3+
simpleapi_download_test_suite(
4+
name = "simpleapi_download_tests",
5+
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright 2024 The Bazel Authors. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
""
16+
17+
load("@rules_testing//lib:test_suite.bzl", "test_suite")
18+
load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download") # buildifier: disable=bzl-visibility
19+
20+
_tests = []
21+
22+
def _test_simple(env):
23+
calls = []
24+
25+
def read_simpleapi(ctx, url, attr, cache, block):
26+
_ = ctx # buildifier: disable=unused-variable
27+
_ = attr
28+
_ = cache
29+
env.expect.that_bool(block).equals(False)
30+
calls.append(url)
31+
if "foo" in url and "main" in url:
32+
return struct(
33+
output = "",
34+
success = False,
35+
)
36+
else:
37+
return struct(
38+
output = "data from {}".format(url),
39+
success = True,
40+
)
41+
42+
contents = simpleapi_download(
43+
ctx = struct(
44+
os = struct(environ = {}),
45+
),
46+
attr = struct(
47+
index_url_overrides = {},
48+
index_url = "main",
49+
extra_index_urls = ["extra"],
50+
sources = ["foo", "bar", "baz"],
51+
envsubst = [],
52+
),
53+
cache = {},
54+
parallel_download = True,
55+
read_simpleapi = read_simpleapi,
56+
)
57+
58+
env.expect.that_collection(calls).contains_exactly([
59+
"extra/foo/",
60+
"main/bar/",
61+
"main/baz/",
62+
"main/foo/",
63+
])
64+
env.expect.that_dict(contents).contains_exactly({
65+
"bar": "data from main/bar/",
66+
"baz": "data from main/baz/",
67+
"foo": "data from extra/foo/",
68+
})
69+
70+
_tests.append(_test_simple)
71+
72+
def _test_fail(env):
73+
calls = []
74+
fails = []
75+
76+
def read_simpleapi(ctx, url, attr, cache, block):
77+
_ = ctx # buildifier: disable=unused-variable
78+
_ = attr
79+
_ = cache
80+
env.expect.that_bool(block).equals(False)
81+
calls.append(url)
82+
if "foo" in url:
83+
return struct(
84+
output = "",
85+
success = False,
86+
)
87+
else:
88+
return struct(
89+
output = "data from {}".format(url),
90+
success = True,
91+
)
92+
93+
simpleapi_download(
94+
ctx = struct(
95+
os = struct(environ = {}),
96+
),
97+
attr = struct(
98+
index_url_overrides = {},
99+
index_url = "main",
100+
extra_index_urls = ["extra"],
101+
sources = ["foo", "bar", "baz"],
102+
envsubst = [],
103+
),
104+
cache = {},
105+
parallel_download = True,
106+
read_simpleapi = read_simpleapi,
107+
_fail = fails.append,
108+
)
109+
110+
env.expect.that_collection(fails).contains_exactly([
111+
"""Failed to download metadata for ["foo"] for from urls: ["main", "extra"]""",
112+
])
113+
env.expect.that_collection(calls).contains_exactly([
114+
"extra/foo/",
115+
"main/bar/",
116+
"main/baz/",
117+
"main/foo/",
118+
])
119+
120+
_tests.append(_test_fail)
121+
122+
def simpleapi_download_test_suite(name):
123+
"""Create the test suite.
124+
125+
Args:
126+
name: the name of the test suite
127+
"""
128+
test_suite(name = name, basic_tests = _tests)

0 commit comments

Comments
 (0)