Skip to content

Commit 58415d1

Browse files
authored
apacheGH-38183: [CI][Python] Use pipx to install GCS testbench (apache#43852)
### Rationale for this change

Installing the GCS testbench using the same Python that's being used to test PyArrow is fragile: some testbench versions may not be compatible, or there could be conflicts among the dependencies of the respective libraries.

### What changes are included in this PR?

Use `pipx` to install the GCS testbench in a separate, controlled environment, using an appropriate Python version.

### Are these changes tested?

Yes, by CI.

### Are there any user-facing changes?

No.

* GitHub Issue: apache#38183

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 0bc91dd commit 58415d1

16 files changed

+122
-90
lines changed

.github/workflows/cpp.yml

+5-3
Original file line numberDiff line numberDiff line change
@@ -465,15 +465,17 @@ jobs:
465465
chmod +x /usr/local/bin/minio.exe
466466
- name: Set up Python
467467
uses: actions/[email protected]
468+
id: python-install
468469
with:
469470
python-version: 3.9
470471
- name: Install Google Cloud Storage Testbench
471-
shell: bash
472+
shell: msys2 {0}
473+
env:
474+
PIPX_BIN_DIR: /usr/local/bin
475+
PIPX_PYTHON: ${{ steps.python-install.outputs.python-path }}
472476
run: |
473477
ci/scripts/install_gcs_testbench.sh default
474-
echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV
475478
- name: Test
476479
shell: msys2 {0}
477480
run: |
478-
PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}"
479481
ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"

appveyor.yml

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ only_commits:
2424
- appveyor.yml
2525
- ci/appveyor*
2626
- ci/conda*
27+
- ci/scripts/*.bat
2728
- cpp/
2829
- format/
2930
- python/

ci/appveyor-cpp-build.bat

+2
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON
4646
set ARROW_CXXFLAGS=/WX /MP
4747

4848
@rem Install GCS testbench
49+
set PIPX_BIN_DIR=C:\Windows\
4950
call %CD%\ci\scripts\install_gcs_testbench.bat
51+
storage-testbench -h || exit /B
5052

5153
@rem
5254
@rem Build and test Arrow C++ libraries (including Parquet)

ci/docker/conda-cpp.dockerfile

+7-5
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,19 @@ RUN mamba install -q -y \
4242
valgrind && \
4343
mamba clean --all
4444

45+
# We want to install the GCS testbench using the Conda base environment's Python,
46+
# because the test environment's Python may later change.
47+
ENV PIPX_PYTHON=/opt/conda/bin/python3
48+
COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
49+
RUN /arrow/ci/scripts/install_gcs_testbench.sh default
50+
4551
# Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to
46-
# be on the path for the tests to run.
52+
# be on the path for the tests to run.
4753
ENV PATH=/opt/conda/envs/arrow/bin:$PATH
4854

4955
COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
5056
RUN /arrow/ci/scripts/install_azurite.sh
5157

52-
# We want to install the GCS testbench using the same Python binary that the Conda code will use.
53-
COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
54-
RUN /arrow/ci/scripts/install_gcs_testbench.sh default
55-
5658
COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
5759
RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
5860

ci/docker/conda-python.dockerfile

-5
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@ RUN mamba install -q -y \
3232
nomkl && \
3333
mamba clean --all
3434

35-
# XXX The GCS testbench was already installed in conda-cpp.dockerfile,
36-
# but we changed the installed Python version above, so we need to reinstall it.
37-
COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
38-
RUN /arrow/ci/scripts/install_gcs_testbench.sh default
39-
4035
ENV ARROW_ACERO=ON \
4136
ARROW_BUILD_STATIC=OFF \
4237
ARROW_BUILD_TESTS=OFF \

ci/docker/python-wheel-windows-test-vs2019.dockerfile

+19-8
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,27 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin"
3535
RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \
3636
rm -rf Python*
3737

38+
# Install the GCS testbench using a well-known Python version.
39+
# NOTE: cannot use pipx's `--fetch-missing-python` because of
40+
# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves.
41+
RUN choco install -r -y --pre --no-progress python --version=3.11.9
42+
ENV PIPX_BIN_DIR=C:\\Windows\\
43+
ENV PIPX_PYTHON="C:\Python311\python.exe"
44+
COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/
45+
RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \
46+
storage-testbench -h
47+
3848
# Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0)
3949
ARG python=3.8
40-
RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \
41-
(if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \
42-
(if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \
43-
(if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \
44-
(if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \
45-
(if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts")
50+
RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10") & \
51+
(if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \
52+
(if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \
53+
(if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \
54+
(if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4") & \
55+
(if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1")
4656

4757
# Install archiver to extract xz archives
48-
RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \
49-
python -m pip install --no-cache-dir -U pip setuptools & \
58+
RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% && \
5059
choco install --no-progress -r -y archiver
60+
61+
ENV PYTHON=$python

ci/docker/ubuntu-20.04-cpp-minimal.dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
3333
libssl-dev \
3434
libcurl4-openssl-dev \
3535
python3-pip \
36+
python3-venv \
3637
tzdata \
3738
wget && \
3839
apt-get clean && \

ci/docker/ubuntu-22.04-cpp-minimal.dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
3333
libssl-dev \
3434
libcurl4-openssl-dev \
3535
python3-pip \
36+
python3-venv \
3637
tzdata \
3738
wget && \
3839
apt-get clean && \

ci/docker/ubuntu-24.04-cpp-minimal.dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
3333
libssl-dev \
3434
libcurl4-openssl-dev \
3535
python3-pip \
36+
python3-venv \
3637
tzdata \
3738
tzdata-legacy \
3839
wget && \

ci/scripts/install_gcs_testbench.bat

+11-2
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,18 @@
1717

1818
@echo on
1919

20-
set GCS_TESTBENCH_VERSION="v0.36.0"
20+
set GCS_TESTBENCH_VERSION="v0.40.0"
21+
22+
set PIPX_FLAGS=--verbose
23+
if NOT "%PIPX_PYTHON%"=="" (
24+
set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS%
25+
)
26+
27+
python -m pip install -U pipx || exit /B 1
2128

2229
@REM Install GCS testbench %GCS_TESTBENCH_VERSION%
23-
python -m pip install ^
30+
pipx install %PIPX_FLAGS% ^
2431
"https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz" ^
2532
|| exit /B 1
33+
34+
pipx list --verbose

ci/scripts/install_gcs_testbench.sh

+12-8
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# specific language governing permissions and limitations
1818
# under the License.
1919

20-
set -e
20+
set -ex
2121

2222
if [ "$#" -ne 1 ]; then
2323
echo "Usage: $0 <storage-testbench version>"
@@ -34,19 +34,23 @@ case "$(uname -m)" in
3434
;;
3535
esac
3636

37-
# On newer pythons install into the system will fail, so override that
38-
export PIP_BREAK_SYSTEM_PACKAGES=1
39-
4037
version=$1
4138
if [[ "${version}" -eq "default" ]]; then
4239
version="v0.39.0"
43-
# Latests versions of Testbench require newer setuptools
44-
python3 -m pip install --upgrade setuptools
4540
fi
4641

42+
: ${PIPX_PYTHON:=$(which python3)}
43+
44+
export PIP_BREAK_SYSTEM_PACKAGES=1
45+
${PIPX_PYTHON} -m pip install -U pipx
46+
4747
# This script is run with PYTHON undefined in some places,
4848
# but those only use older pythons.
4949
if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then
50-
python3 -m pip install \
51-
"https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
50+
pipx_flags=--verbose
51+
if [[ $(id -un) == "root" ]]; then
52+
# Install globally as /root/.local/bin is typically not in $PATH
53+
pipx_flags="${pipx_flags} --global"
54+
fi
55+
${PIPX_PYTHON} -m pipx install ${pipx_flags} "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
5256
fi

ci/scripts/python_wheel_windows_test.bat

+22-18
Original file line numberDiff line numberDiff line change
@@ -37,28 +37,32 @@ set PYARROW_TEST_TENSORFLOW=ON
3737
set ARROW_TEST_DATA=C:\arrow\testing\data
3838
set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data
3939

40-
@REM Install testing dependencies
41-
pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1
40+
@REM List installed Pythons
41+
py -0p
42+
43+
set PYTHON_CMD=py -%PYTHON%
4244

43-
@REM Install GCS testbench
44-
call "C:\arrow\ci\scripts\install_gcs_testbench.bat"
45+
%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1
46+
47+
@REM Install testing dependencies
48+
%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1
4549

4650
@REM Install the built wheels
47-
python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1
51+
%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1
4852

4953
@REM Test that the modules are importable
50-
python -c "import pyarrow" || exit /B 1
51-
python -c "import pyarrow._gcsfs" || exit /B 1
52-
python -c "import pyarrow._hdfs" || exit /B 1
53-
python -c "import pyarrow._s3fs" || exit /B 1
54-
python -c "import pyarrow.csv" || exit /B 1
55-
python -c "import pyarrow.dataset" || exit /B 1
56-
python -c "import pyarrow.flight" || exit /B 1
57-
python -c "import pyarrow.fs" || exit /B 1
58-
python -c "import pyarrow.json" || exit /B 1
59-
python -c "import pyarrow.orc" || exit /B 1
60-
python -c "import pyarrow.parquet" || exit /B 1
61-
python -c "import pyarrow.substrait" || exit /B 1
54+
%PYTHON_CMD% -c "import pyarrow" || exit /B 1
55+
%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1
56+
%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1
57+
%PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1
58+
%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1
59+
%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1
60+
%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1
61+
%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1
62+
%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1
63+
%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1
64+
%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1
65+
%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1
6266

6367
@rem Download IANA Timezone Database for ORC C++
6468
curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B
@@ -67,4 +71,4 @@ arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata
6771
set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo
6872

6973
@REM Execute unittest
70-
pytest -r s --pyargs pyarrow || exit /B 1
74+
%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1

cpp/src/arrow/filesystem/gcsfs_test.cc

+34-34
Original file line numberDiff line numberDiff line change
@@ -95,52 +95,52 @@ class GcsTestbench : public ::testing::Environment {
9595
if (const auto* env = std::getenv("PYTHON")) {
9696
names = {env};
9797
}
98-
auto error = std::string(
99-
"Could not start GCS emulator."
100-
" Used the following list of python interpreter names:");
101-
for (const auto& interpreter : names) {
102-
auto exe_path = bp::search_path(interpreter);
103-
error += " " + interpreter;
104-
if (exe_path.empty()) {
105-
error += " (exe not found)";
106-
continue;
107-
}
98+
auto error = std::string("Could not start GCS emulator 'storage-testbench'");
10899

109-
bp::ipstream output;
110-
server_process_ = bp::child(exe_path, "-m", "testbench", "--port", port_, group_,
111-
bp::std_err > output);
100+
auto testbench_is_running = [](bp::child& process, bp::ipstream& output) {
112101
// Wait for message: "* Restarting with"
113-
auto testbench_is_running = [&output, this](bp::child& process) {
114-
std::string line;
115-
std::chrono::time_point<std::chrono::steady_clock> end =
116-
std::chrono::steady_clock::now() + std::chrono::seconds(10);
117-
while (server_process_.valid() && server_process_.running() &&
118-
std::chrono::steady_clock::now() < end) {
119-
if (output.peek() && std::getline(output, line)) {
120-
std::cerr << line << std::endl;
121-
if (line.find("* Restarting with") != std::string::npos) return true;
122-
} else {
123-
std::this_thread::sleep_for(std::chrono::milliseconds(20));
124-
}
102+
std::string line;
103+
std::chrono::time_point<std::chrono::steady_clock> end =
104+
std::chrono::steady_clock::now() + std::chrono::seconds(10);
105+
while (process.valid() && process.running() &&
106+
std::chrono::steady_clock::now() < end) {
107+
if (output.peek() && std::getline(output, line)) {
108+
std::cerr << line << std::endl;
109+
if (line.find("* Restarting with") != std::string::npos) return true;
110+
} else {
111+
std::this_thread::sleep_for(std::chrono::milliseconds(20));
125112
}
126-
return false;
127-
};
113+
}
114+
return false;
115+
};
128116

129-
if (testbench_is_running(server_process_)) break;
130-
error += " (failed to start)";
131-
server_process_.terminate();
132-
server_process_.wait();
117+
auto exe_path = bp::search_path("storage-testbench");
118+
if (!exe_path.empty()) {
119+
bp::ipstream output;
120+
server_process_ =
121+
bp::child(exe_path, "--port", port_, group_, bp::std_err > output);
122+
if (!testbench_is_running(server_process_, output)) {
123+
error += " (failed to start)";
124+
server_process_.terminate();
125+
server_process_.wait();
126+
}
127+
} else {
128+
error += " (exe not found)";
129+
}
130+
if (!server_process_.valid()) {
131+
error_ = std::move(error);
133132
}
134-
if (server_process_.valid() && server_process_.valid()) return;
135-
error_ = std::move(error);
136133
}
137134

138135
bool running() { return server_process_.running(); }
139136

140137
~GcsTestbench() override {
141138
// Brutal shutdown, kill the full process group because the GCS testbench may launch
142139
// additional children.
143-
group_.terminate();
140+
try {
141+
group_.terminate();
142+
} catch (bp::process_error&) {
143+
}
144144
if (server_process_.valid()) {
145145
server_process_.wait();
146146
}

python/pyarrow/tests/conftest.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -233,17 +233,16 @@ def minio_server_health_check(address):
233233
def gcs_server():
234234
port = find_free_port()
235235
env = os.environ.copy()
236-
args = [sys.executable, '-m', 'testbench', '--port', str(port)]
236+
exe = 'storage-testbench'
237+
args = [exe, '--port', str(port)]
237238
proc = None
238239
try:
239-
# check first if testbench module is available
240-
import testbench # noqa:F401
241240
# start server
242241
proc = subprocess.Popen(args, env=env)
243242
# Make sure the server is alive.
244243
if proc.poll() is not None:
245244
pytest.skip(f"Command {args} did not start server successfully!")
246-
except (ModuleNotFoundError, OSError) as e:
245+
except OSError as e:
247246
pytest.skip(f"Command {args} failed to execute: {e}")
248247
else:
249248
yield {

python/scripts/run_emscripten_tests.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ def _load_pyarrow_in_runner(driver, wheel_name):
335335
"""
336336
import pyarrow,pathlib
337337
pyarrow_dir = pathlib.Path(pyarrow.__file__).parent
338-
pytest.main([pyarrow_dir, '-v'])
338+
pytest.main([pyarrow_dir, '-r', 's'])
339339
""",
340340
wait_for_terminate=False,
341341
)

r/tests/testthat/test-gcs.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,12 @@ test_that("GcsFileSystem$create() can read json_credentials", {
116116
})
117117

118118
skip_on_cran()
119-
skip_if_not(system('python -c "import testbench"') == 0, message = "googleapis-storage-testbench is not installed.")
119+
skip_if_not(system("storage-testbench -h") == 0, message = "googleapis-storage-testbench is not installed.")
120120
library(dplyr)
121121

122122
testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001")
123123

124-
pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port),
124+
pid_minio <- sys::exec_background("storage-testbench", c("--port", testbench_port),
125125
std_out = FALSE,
126126
std_err = FALSE # TODO: is there a good place to send output?
127127
)

0 commit comments

Comments
 (0)