Skip to content

Commit

Permalink
Merge pull request #16813 from bdice/branch-24.10-merge-24.08
Browse files Browse the repository at this point in the history
Forward-merge branch-24.08 to branch-24.10
  • Loading branch information
AyodeAwe authored Sep 20, 2024
2 parents f71f53a + 9834a3a commit eeb4bae
Show file tree
Hide file tree
Showing 66 changed files with 2,685 additions and 435 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- wheel-tests-cudf
- wheel-build-cudf-polars
- wheel-tests-cudf-polars
- cudf-polars-polars-tests
- wheel-build-dask-cudf
- wheel-tests-dask-cudf
- devcontainer
Expand Down Expand Up @@ -244,6 +245,17 @@ jobs:
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_wheel_cudf_polars.sh"
cudf-polars-polars-tests:
needs: wheel-build-cudf-polars
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_cudf_polars_polars_tests.sh"
wheel-build-dask-cudf:
needs: wheel-build-cudf
secrets: inherit
Expand Down
27 changes: 27 additions & 0 deletions ci/run_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

# Support invoking run_cudf_polars_pytests.sh outside the script directory
# Assumption, polars has been cloned in the root of the repo.
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/

DESELECTED_TESTS=(
"tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place
"tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
"tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
"tests/docs/test_user_guide.py" # No dot binary in CI image
)

DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
python -m pytest \
--import-mode=importlib \
--cache-clear \
-m "" \
-p cudf_polars.testing.plugin \
-v \
--tb=short \
${DESELECTED_TESTS} \
"$@" \
py-polars/tests
69 changes: 69 additions & 0 deletions ci/test_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -eou pipefail

# We will only fail these tests if the PR touches code in pylibcudf
# or cudf_polars itself.
# Note, the three dots mean we are doing diff between the merge-base
# of upstream and HEAD. So this is asking, "does _this branch_ touch
# files in cudf_polars/pylibcudf", rather than "are there changes
# between upstream and this branch which touch cudf_polars/pylibcudf"
# TODO: is the target branch exposed anywhere in an environment variable?
if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the pylibcudf built in the previous step
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep

rapids-logger "Install pylibcudf"
python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl

rapids-logger "Install cudf_polars"
python -m pip install $(echo ./dist/cudf_polars*.whl)

# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
TAG="py-1.7.0"
rapids-logger "Clone polars to ${TAG}"
git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1

# Install requirements for running polars tests
rapids-logger "Install polars test requirements"
python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt

function set_exitcode()
{
EXITCODE=$?
}
EXITCODE=0
trap set_exitcode ERR
set +e

rapids-logger "Run polars tests"
./ci/run_cudf_polars_polars_tests.sh

trap ERR
set -e

if [ ${EXITCODE} != 0 ]; then
rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}"
else
rapids-logger "Running polars test suite PASSED"
fi

if [ ${HAS_CHANGES} == 1 ]; then
exit ${EXITCODE}
else
exit 0
fi
7 changes: 7 additions & 0 deletions ci/test_wheel_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ set -eou pipefail
if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist

Expand All @@ -43,6 +47,9 @@ python -m pip install \
"$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"

rapids-logger "Pin to 1.7.0 Temporarily"
python -m pip install polars==1.7.0

rapids-logger "Run cudf_polars tests"

function set_exitcode()
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=1.0,<1.3
- polars>=1.6
run_dask_cudf:
common:
- output_types: [conda, requirements, pyproject]
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/cudf/source/_static/pds_benchmark_polars.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
41 changes: 41 additions & 0 deletions docs/cudf/source/cudf_polars/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
cuDF-based GPU backend for Polars [Open Beta]
=============================================

cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API.
The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations
and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine
whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine
and run on the CPU.

Benchmark
---------
We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:

.. figure:: ../_static/pds_benchmark_polars.png
:width: 600px



You can see up to 13x speedup using the GPU backend on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top performing queries:


.. figure:: ../_static/compute_heavy_queries_polars.png
:width: 1000px

:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe`

You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository <https://github.com/pola-rs/polars-benchmark>`__.

Learn More
----------

The GPU backend for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page <https://docs.pola.rs/user-guide/gpu-support/>`__ on the Polars website.

Launch on Google Colab
----------------------

.. figure:: ../_static/colab.png
:width: 200px
:target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb

Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab <https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb>`__.
1 change: 1 addition & 0 deletions docs/cudf/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@ other operations.

user_guide/index
cudf_pandas/index
cudf_polars/index
libcudf_docs/index
developer_guide/index
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ strings
repeat
replace
slice
strip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
strip
=====

.. automodule:: pylibcudf.strings.strip
:members:
42 changes: 5 additions & 37 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type
from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

import pylibcudf as plc


@acquire_spill_lock()
def add_months(Column col, Column months):
Expand All @@ -38,43 +40,9 @@ def add_months(Column col, Column months):

@acquire_spill_lock()
def extract_datetime_component(Column col, object field):

cdef unique_ptr[column] c_result
cdef column_view col_view = col.view()

with nogil:
if field == "year":
c_result = move(libcudf_datetime.extract_year(col_view))
elif field == "month":
c_result = move(libcudf_datetime.extract_month(col_view))
elif field == "day":
c_result = move(libcudf_datetime.extract_day(col_view))
elif field == "weekday":
c_result = move(libcudf_datetime.extract_weekday(col_view))
elif field == "hour":
c_result = move(libcudf_datetime.extract_hour(col_view))
elif field == "minute":
c_result = move(libcudf_datetime.extract_minute(col_view))
elif field == "second":
c_result = move(libcudf_datetime.extract_second(col_view))
elif field == "millisecond":
c_result = move(
libcudf_datetime.extract_millisecond_fraction(col_view)
)
elif field == "microsecond":
c_result = move(
libcudf_datetime.extract_microsecond_fraction(col_view)
)
elif field == "nanosecond":
c_result = move(
libcudf_datetime.extract_nanosecond_fraction(col_view)
)
elif field == "day_of_year":
c_result = move(libcudf_datetime.day_of_year(col_view))
else:
raise ValueError(f"Invalid datetime field: '{field}'")

result = Column.from_unique_ptr(move(c_result))
result = Column.from_pylibcudf(
plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field)
)

if field == "weekday":
# Pandas counts Monday-Sunday as 0-6
Expand Down
Loading

0 comments on commit eeb4bae

Please sign in to comment.